Exercise 7-8
Change the cross-reference program to find all the URLs in a file, and write all the lines on which each distinct URL occurs.
Solution
This is surprisingly easy if you have completed the previous exercises. Chapter 7 of the book has provided the instruction to solve this. This exerise is about “doing it”.
This is the solution strategy:
- Use the entire project as per Solution to Exercise 7-7. i.e. main.cpp, split.cpp/.h, xref.cpp/.h.
- Add the find_url function components as per Solution to Exercise 6-0 (Part 4 / 7). i.e. find_urls.cpp/.h, not_url_char.cpp/.h, url_beg.cpp/.h, url_end.cpp/.h.
- Update the main function so that this bit of code…
// Call xref using split by default. map<string, vector<int> > ret = xref(cin);
… is replaced by this (using find_urls)….
// Call xref using find_urls (instead of the default split) map<string, vector<int> > ret = xref(cin, find_urls);
Though split.cpp and split.h will not be called directly, the function split may is included by the xref function. So for safety, I will leave it in the project.
The Project
In the following section I, for clarity sake, I shall include the entire project (C++ header and source files).

Source File List
Header File List
Source Files
main.cpp
#include <iostream> // std::cin, std::cout, std::endl
#include <map> // std::map
#include <string> // std::string
#include <vector> // std::vector
#include "xref.h" // xref
#include "find_urls.h" // find_urls
using std::cin;
using std::cout;
using std::endl;
using std::map;
using std::string;
using std::vector;
// Find all the lines that refer to each word in the input
// (S7.3/128)
int main() {
// Call xref using find_urls (instead of the default split)
map<string, vector<int> > ret = xref(cin, find_urls);
// Write the results.
for (map<string, vector<int> >::const_iterator it = ret.begin(); it != ret.end(); ++it) {
// Find number of lines the word has appeared
vector<int>::size_type numLines = (it->second).size();
// Write the word
cout << it->first << " occurs on ";
if (numLines == 1)
cout << "line: ";
else
cout << "lines: ";
// Followed by one or more line numbers.
vector<int>::const_iterator line_it = it->second.begin();
cout << *line_it; // write the first line number
++line_it;
// Write the rest of the line numbers, if any.
while (line_it != it->second.end()) {
cout << ", " << *line_it;
++line_it;
}
// Write a new line to separate each word from the next.
cout << endl;
}
return 0;
}
find_urls.cpp
#include <string> // string
#include <vector> // vector
#include "url_beg.h" // url_beg
#include "url_end.h" // url_end
using std::string;
using std::vector;
vector<string> find_urls(const string& s) {
vector<string> ret;
typedef string::const_iterator iter;
iter b = s.begin(), e = s.end();
// look through the entire input
while (b != e) {
// look for one or more letters followed by ://
b = url_beg(b, e);
// if we found it
if (b != e) {
// get the rest of the URL
iter after = url_end(b, e);
// remember the URL
ret.push_back(string(b, after));
// advance b and check for more URLs on this line
b = after;
}
} return ret;
}
not_url_char.cpp
#include <string> // string, isalnum
#include <algorithm> // find
using std::string;
bool not_url_char(char c) {
// characters, in addition to alphanumerics, that can appear in a URL
static const string url_ch = "~;/?:@=&$-_.+!*'(),";
// see whether c can appear in a URL and return the negative
return !(isalnum(c) || find(url_ch.begin(), url_ch.end(), c) != url_ch.end() );
}
split.cpp
#include <string> // string
#include <vector> // vector
#include <cctype> // isspace
#include <algorithm> // find_if
using std::vector;
using std::string;
using std::isspace;
using std::find_if;
// true if the argument is whitespace, false otherwise
// (S6.1.1/103)
bool space(char c) { return isspace(c); }
// false if the argument is whitespace, true otherwise
// (S6.1.1/103)
bool not_space(char c) { return !isspace(c); }
// Scan a line and split into words. Return a vector that contains these words.
// (S6.1.1/103)
vector<string> split(const string& str) {
typedef string::const_iterator iter; vector<string> ret;
iter i = str.begin(); while (i != str.end()) {
// Ignore leading blanks
i = find_if(i, str.end(), not_space);
// Find end of next word
iter j = find_if(i, str.end(), space);
// Copy the characters in ([i, j)
if (i != str.end())
ret.push_back(string(i, j));
// Re-initialize
i = j;
}
return ret;
}
url_beg.cpp
#include <string> // string, isalpha
#include <algorithm> // search
#include "not_url_char.h" // not_url_char
using std::string;
string::const_iterator url_beg(string::const_iterator b, string::const_iterator e) {
static const string sep = "://";
typedef string::const_iterator iter;
// i marks where the separator was found iter i = b;
while ((i = search(i, e, sep.begin(), sep.end() )) != e) {
// make sure the separator isn't at the beginning or end of the line
if (i != b && i + sep.size() != e) {
// beg marks the beginning of the protocol-name
iter beg = i;
while (beg != b && isalpha(beg[-1]))
--beg;
// is there at least one appropriate character before and after the separator?
if (beg != i && !not_url_char(i[sep.size()]))
return beg;
}
// the separator we found wasn't part of a URL; advance i past this separator
i += sep.size();
}
return e;
}
url_end.cpp
#include <string> // string
#include <vector> // vector
#include <algorithm> // find_if
#include "not_url_char.h" // not_url_char
using std::string;
string::const_iterator url_end(string::const_iterator b, string::const_iterator e) {
return find_if(b, e, not_url_char);
}
xref.cpp
#include <iostream> // std::cin, std::istream
#include <map> // std::map
#include <string> // std::string
#include <vector> // std::vector
#include <algorithm> // std::find
#include "split.h" // split
using std::cin;
using std::map;
using std::string;
using std::vector;
using std::istream;
using std::find;
// Find all the lines that refer to each word in the input
// In this program, the first line is represented by line 1,
// second line is represented by line 2, etc.
// (S7.3/126)
// Adjusted as per Exercise 7-3: store only non-duplicated line numbers.
map<string, vector<int> > xref(istream& in, vector<string> find_words(const string&) = split) {
string line; int line_number = 0; map<string, vector<int> > ret; // map string to vector<int>
// Read the next line while (getline(in, line)) { ++line_number;
// Break the input line into words
vector<string> words = find_words(line);
// remember that each word occurs on the current line
for (vector<string>::const_iterator it = words.begin();
it != words.end(); ++it) {
// store only non-duplicated line numbers.
if ( find( ret[*it].begin(), ret[*it].end(), line_number) == ret[*it].end() )
ret[*it].push_back(line_number);
}
}
return ret;
}
Header Files
find_urls.h
#ifndef GUARD_FIND_URLS_H #define GUARD_FIND_URLS_H #include <vector> #include <string> std::vector<std::string> find_urls(const std::string&); #endif // GUARD_FIND_URLS_H
not_url_char.h
#ifndef GUARD_NOT_URL_CHAR_H #define GUARD_NOT_URL_CHAR_H bool not_url_char(char); #endif // GUARD_NOT_URL_CHAR_H
split.h
#ifndef GUARD_SPLIT_H #define GUARD_SPLIT_H #include <vector> #include <string> std::vector<std::string> split(const std::string&); #endif // GUARD_SPLIT_H
url_beg.h
#ifndef GUARD_URL_BEG_H #define GUARD_URL_BEG_H std::string::const_iterator url_beg(std::string::const_iterator, std::string::const_iterator); #endif // GUARD_URL_BEG_H
url_end.h
#ifndef GUARD_URL_END_H #define GUARD_URL_END_H #include <string> std::string::const_iterator url_end(std::string::const_iterator, std::string::const_iterator); #endif // GUARD_URL_END_H
xref.h
#ifndef GUARD_XREF_H #define GUARD_XREF_H // xref.h #include <iostream> // std::istream #include <map> // std::map #include <string> // std::string #include <vector> // std::vector #include "split.h" // split std::map<std::string, std::vector<int> > xref(std::istream&, std::vector<std::string> find_words(const std::string&) = split); #endif // GUARD_XREF_H
Test Result
Run the program and submit some random text confirms the program works as expected.
hello world http://google.co.uk and http://bbc.co.uk and http://yahoo.com and maybe http://google.co.uk also http://bbc.co.uk and http://yahoo.com and of course http://google.co.uk and finally http://w3schools.com and http://google.com ^Z http://bbc.co.uk occurs on lines: 1, 3 http://google.co.uk occurs on lines: 1, 2, 5 http://google.com occurs on line: 7 http://w3schools.com occurs on line: 6 http://yahoo.com occurs on lines: 2, 4
Reference
Koenig, Andrew & Moo, Barbara E., Accelerated C++, Addison-Wesley, 2000
Testing markdown.
this is a strong text
this is italic text
this
int,double.int main() { code goes here... }“`