Exercise 7-8
Change the cross-reference program to find all the URLs in a file, and write all the lines on which each distinct URL occurs.
Solution
This is surprisingly easy if you have completed the previous exercises. Chapter 7 of the book has provided the instruction to solve this. This exerise is about “doing it”.
This is the solution strategy:
- Use the entire project as per Solution to Exercise 7-7. i.e. main.cpp, split.cpp/.h, xref.cpp/.h.
- Add the find_url function components as per Solution to Exercise 6-0 (Part 4 / 7). i.e. find_urls.cpp/.h, not_url_char.cpp/.h, url_beg.cpp/.h, url_end.cpp/.h.
- Update the main function so that this bit of code…
// Call xref using split by default. map<string, vector<int> > ret = xref(cin);
… is replaced by this (using find_urls)….
// Call xref using find_urls (instead of the default split) map<string, vector<int> > ret = xref(cin, find_urls);
Though split.cpp and split.h will not be called directly, the function split may is included by the xref function. So for safety, I will leave it in the project.
The Project
In the following section I, for clarity sake, I shall include the entire project (C++ header and source files).
Source File List
Header File List
Source Files
main.cpp
#include <iostream> // std::cin, std::cout, std::endl #include <map> // std::map #include <string> // std::string #include <vector> // std::vector #include "xref.h" // xref #include "find_urls.h" // find_urls using std::cin; using std::cout; using std::endl; using std::map; using std::string; using std::vector; // Find all the lines that refer to each word in the input // (S7.3/128) int main() { // Call xref using find_urls (instead of the default split) map<string, vector<int> > ret = xref(cin, find_urls); // Write the results. for (map<string, vector<int> >::const_iterator it = ret.begin(); it != ret.end(); ++it) { // Find number of lines the word has appeared vector<int>::size_type numLines = (it->second).size(); // Write the word cout << it->first << " occurs on "; if (numLines == 1) cout << "line: "; else cout << "lines: "; // Followed by one or more line numbers. vector<int>::const_iterator line_it = it->second.begin(); cout << *line_it; // write the first line number ++line_it; // Write the rest of the line numbers, if any. while (line_it != it->second.end()) { cout << ", " << *line_it; ++line_it; } // Write a new line to separate each word from the next. cout << endl; } return 0; }
find_urls.cpp
#include <string> // string #include <vector> // vector #include "url_beg.h" // url_beg #include "url_end.h" // url_end using std::string; using std::vector; vector<string> find_urls(const string& s) { vector<string> ret; typedef string::const_iterator iter; iter b = s.begin(), e = s.end(); // look through the entire input while (b != e) { // look for one or more letters followed by :// b = url_beg(b, e); // if we found it if (b != e) { // get the rest of the URL iter after = url_end(b, e); // remember the URL ret.push_back(string(b, after)); // advance b and check for more URLs on this line b = after; } } return ret; }
not_url_char.cpp
#include <string> // string, isalnum #include <algorithm> // find using std::string; bool not_url_char(char c) { // characters, in addition to alphanumerics, that can appear in a URL static const string url_ch = "~;/?:@=&$-_.+!*'(),"; // see whether c can appear in a URL and return the negative return !(isalnum(c) || find(url_ch.begin(), url_ch.end(), c) != url_ch.end() ); }
split.cpp
#include <string> // string #include <vector> // vector #include <cctype> // isspace #include <algorithm> // find_if using std::vector; using std::string; using std::isspace; using std::find_if; // true if the argument is whitespace, false otherwise // (S6.1.1/103) bool space(char c) { return isspace(c); } // false if the argument is whitespace, true otherwise // (S6.1.1/103) bool not_space(char c) { return !isspace(c); } // Scan a line and split into words. Return a vector that contains these words. // (S6.1.1/103) vector<string> split(const string& str) { typedef string::const_iterator iter; vector<string> ret; iter i = str.begin(); while (i != str.end()) { // Ignore leading blanks i = find_if(i, str.end(), not_space); // Find end of next word iter j = find_if(i, str.end(), space); // Copy the characters in ([i, j) if (i != str.end()) ret.push_back(string(i, j)); // Re-initialize i = j; } return ret; }
url_beg.cpp
#include <string> // string, isalpha #include <algorithm> // search #include "not_url_char.h" // not_url_char using std::string; string::const_iterator url_beg(string::const_iterator b, string::const_iterator e) { static const string sep = "://"; typedef string::const_iterator iter; // i marks where the separator was found iter i = b; while ((i = search(i, e, sep.begin(), sep.end() )) != e) { // make sure the separator isn't at the beginning or end of the line if (i != b && i + sep.size() != e) { // beg marks the beginning of the protocol-name iter beg = i; while (beg != b && isalpha(beg[-1])) --beg; // is there at least one appropriate character before and after the separator? if (beg != i && !not_url_char(i[sep.size()])) return beg; } // the separator we found wasn't part of a URL; advance i past this separator i += sep.size(); } return e; }
url_end.cpp
#include <string> // string #include <vector> // vector #include <algorithm> // find_if #include "not_url_char.h" // not_url_char using std::string; string::const_iterator url_end(string::const_iterator b, string::const_iterator e) { return find_if(b, e, not_url_char); }
xref.cpp
#include <iostream> // std::cin, std::istream #include <map> // std::map #include <string> // std::string #include <vector> // std::vector #include <algorithm> // std::find #include "split.h" // split using std::cin; using std::map; using std::string; using std::vector; using std::istream; using std::find; // Find all the lines that refer to each word in the input // In this program, the first line is represented by line 1, // second line is represented by line 2, etc. // (S7.3/126) // Adjusted as per Exercise 7-3: store only non-duplicated line numbers. map<string, vector<int> > xref(istream& in, vector<string> find_words(const string&) = split) { string line; int line_number = 0; map<string, vector<int> > ret; // map string to vector<int> // Read the next line while (getline(in, line)) { ++line_number; // Break the input line into words vector<string> words = find_words(line); // remember that each word occurs on the current line for (vector<string>::const_iterator it = words.begin(); it != words.end(); ++it) { // store only non-duplicated line numbers. if ( find( ret[*it].begin(), ret[*it].end(), line_number) == ret[*it].end() ) ret[*it].push_back(line_number); } } return ret; }
Header Files
find_urls.h
#ifndef GUARD_FIND_URLS_H #define GUARD_FIND_URLS_H #include <vector> #include <string> std::vector<std::string> find_urls(const std::string&); #endif // GUARD_FIND_URLS_H
not_url_char.h
#ifndef GUARD_NOT_URL_CHAR_H #define GUARD_NOT_URL_CHAR_H bool not_url_char(char); #endif // GUARD_NOT_URL_CHAR_H
split.h
#ifndef GUARD_SPLIT_H #define GUARD_SPLIT_H #include <vector> #include <string> std::vector<std::string> split(const std::string&); #endif // GUARD_SPLIT_H
url_beg.h
#ifndef GUARD_URL_BEG_H #define GUARD_URL_BEG_H std::string::const_iterator url_beg(std::string::const_iterator, std::string::const_iterator); #endif // GUARD_URL_BEG_H
url_end.h
#ifndef GUARD_URL_END_H #define GUARD_URL_END_H #include <string> std::string::const_iterator url_end(std::string::const_iterator, std::string::const_iterator); #endif // GUARD_URL_END_H
xref.h
#ifndef GUARD_XREF_H #define GUARD_XREF_H // xref.h #include <iostream> // std::istream #include <map> // std::map #include <string> // std::string #include <vector> // std::vector #include "split.h" // split std::map<std::string, std::vector<int> > xref(std::istream&, std::vector<std::string> find_words(const std::string&) = split); #endif // GUARD_XREF_H
Test Result
Run the program and submit some random text confirms the program works as expected.
hello world http://google.co.uk and http://bbc.co.uk and http://yahoo.com and maybe http://google.co.uk also http://bbc.co.uk and http://yahoo.com and of course http://google.co.uk and finally http://w3schools.com and http://google.com ^Z http://bbc.co.uk occurs on lines: 1, 3 http://google.co.uk occurs on lines: 1, 2, 5 http://google.com occurs on line: 7 http://w3schools.com occurs on line: 6 http://yahoo.com occurs on lines: 2, 4
Reference
Koenig, Andrew & Moo, Barbara E., Accelerated C++, Addison-Wesley, 2000