Accelerated C++ Solution to Exercise 7-8

Exercise 7-8

Change the cross-reference program to find all the URLs in a file, and write all the lines on which each distinct URL occurs.

Solution

This is surprisingly easy if you have completed the previous exercises. Chapter 7 of the book has provided the instruction to solve this. This exerise is about “doing it”.

This is the solution strategy:

  1. Use the entire project as per Solution to Exercise 7-7. i.e. main.cpp, split.cpp/.h, xref.cpp/.h.
  2. Add the find_url function components as per Solution to Exercise 6-0 (Part 4 / 7). i.e. find_urls.cpp/.h, not_url_char.cpp/.h, url_beg.cpp/.h, url_end.cpp/.h.
  3. Update the main function so that this bit of code…
// Call xref using split by default. 
map<string, vector<int> > ret = xref(cin);

… is replaced by this (using find_urls)….

// Call xref using find_urls (instead of the default split) 
map<string, vector<int> > ret = xref(cin, find_urls);

Though split.cpp and split.h will not be called directly, the function split may is included by the xref function. So for safety, I will leave it in the project.

The Project

In the following section I, for clarity sake, I shall include the entire project (C++ header and source files).

acpp7p8_MgntTree

Source File List

Header File List

Source Files

main.cpp

#include <iostream> // std::cin, std::cout, std::endl
#include <map> // std::map
#include <string> // std::string
#include <vector> // std::vector
#include "xref.h" // xref
#include "find_urls.h" // find_urls

using std::cin; 
using std::cout; 
using std::endl; 
using std::map; 
using std::string; 
using std::vector;

// Find all the lines that refer to each word in the input 
// (S7.3/128) 

int main() { 

  // Call xref using find_urls (instead of the default split) 
  map<string, vector<int> > ret = xref(cin, find_urls);

  // Write the results. 
  for (map<string, vector<int> >::const_iterator it = ret.begin(); it != ret.end(); ++it) {

    // Find number of lines the word has appeared
    vector<int>::size_type numLines = (it->second).size();

    // Write the word
    cout << it->first << " occurs on ";
    if (numLines == 1)
      cout << "line: ";
    else
      cout << "lines: ";

    // Followed by one or more line numbers.
    vector<int>::const_iterator line_it = it->second.begin();
    cout << *line_it;   // write the first line number

    ++line_it;

    // Write the rest of the line numbers, if any.
    while (line_it != it->second.end()) {
      cout << ", " << *line_it;
      ++line_it;
    }

    // Write a new line to separate each word from the next.
    cout << endl;

  }

  return 0; 
} 

find_urls.cpp

#include <string> // string
#include <vector> // vector
#include "url_beg.h" // url_beg
#include "url_end.h" // url_end

using std::string; 
using std::vector;

vector<string> find_urls(const string& s) { 

  vector<string> ret; 
  typedef string::const_iterator iter; 
  iter b = s.begin(), e = s.end();

  // look through the entire input 
  while (b != e) {

    // look for one or more letters followed by ://
    b = url_beg(b, e);

    // if we found it
    if (b != e) {
      // get the rest of the URL
      iter after = url_end(b, e);

      // remember the URL
      ret.push_back(string(b, after));

      // advance b and check for more URLs on this line
      b = after;
    }
  } return ret; 
}

not_url_char.cpp

#include <string> // string, isalnum
#include <algorithm> // find

using std::string;

bool not_url_char(char c) { 

  // characters, in addition to alphanumerics, that can appear in a URL 
  static const string url_ch = "~;/?:@=&$-_.+!*'(),";

  // see whether c can appear in a URL and return the negative 
  return !(isalnum(c) || find(url_ch.begin(), url_ch.end(), c) != url_ch.end() ); 

}

split.cpp

#include <string> // string
#include <vector> // vector
#include <cctype> // isspace
#include <algorithm> // find_if

using std::vector; 
using std::string; 
using std::isspace; 
using std::find_if;

// true if the argument is whitespace, false otherwise 
// (S6.1.1/103) 
bool space(char c) { return isspace(c); }

// false if the argument is whitespace, true otherwise 
// (S6.1.1/103) 
bool not_space(char c) { return !isspace(c); }

// Scan a line and split into words. Return a vector that contains these words. 
// (S6.1.1/103) 
vector<string> split(const string& str) { 
  typedef string::const_iterator iter; vector<string> ret;

  iter i = str.begin(); while (i != str.end()) {

  // Ignore leading blanks
  i = find_if(i, str.end(), not_space);

  // Find end of next word
  iter j = find_if(i, str.end(), space);

  // Copy the characters in ([i, j)
  if (i != str.end())
    ret.push_back(string(i, j));

  // Re-initialize
  i = j;

  } 

  return ret; 

}

url_beg.cpp

#include <string> // string, isalpha
#include <algorithm> // search
#include "not_url_char.h" // not_url_char

using std::string;

string::const_iterator url_beg(string::const_iterator b, string::const_iterator e) { 

  static const string sep = "://"; 
  typedef string::const_iterator iter;

  // i marks where the separator was found iter i = b;

  while ((i = search(i, e, sep.begin(), sep.end() )) != e) {

    // make sure the separator isn't at the beginning or end of the line
    if (i != b && i + sep.size() != e) {

      // beg marks the beginning of the protocol-name
      iter beg = i;
      while (beg != b && isalpha(beg[-1]))
        --beg;

      // is there at least one appropriate character before and after the separator?
      if (beg != i && !not_url_char(i[sep.size()]))
        return beg;
    }

    // the separator we found wasn't part of a URL; advance i past this separator
    i += sep.size();

  } 

  return e; 

}

url_end.cpp

#include <string> // string
#include <vector> // vector
#include <algorithm> // find_if
#include "not_url_char.h" // not_url_char

using std::string;

string::const_iterator url_end(string::const_iterator b, string::const_iterator e) { 
  return find_if(b, e, not_url_char); 
}

xref.cpp

#include <iostream> // std::cin, std::istream
#include <map> // std::map
#include <string> // std::string
#include <vector> // std::vector
#include <algorithm> // std::find
#include "split.h" // split

using std::cin; 
using std::map; 
using std::string; 
using std::vector; 
using std::istream; 
using std::find;

// Find all the lines that refer to each word in the input 
// In this program, the first line is represented by line 1, 
// second line is represented by line 2, etc. 
// (S7.3/126) 
// Adjusted as per Exercise 7-3: store only non-duplicated line numbers. 

map<string, vector<int> > xref(istream& in, vector<string> find_words(const string&) = split) { 

  string line; int line_number = 0; map<string, vector<int> > ret; // map string to vector<int>

  // Read the next line while (getline(in, line)) { ++line_number;

    // Break the input line into words
    vector<string> words = find_words(line);

    // remember that each word occurs on the current line
    for (vector<string>::const_iterator it = words.begin();
         it != words.end(); ++it) {

      // store only non-duplicated line numbers.
      if ( find( ret[*it].begin(), ret[*it].end(), line_number) == ret[*it].end() )
        ret[*it].push_back(line_number);
    }


  }

  return ret; 
}

Header Files

find_urls.h

#ifndef GUARD_FIND_URLS_H
#define GUARD_FIND_URLS_H
#include <vector>
#include <string>

std::vector<std::string> find_urls(const std::string&);

#endif // GUARD_FIND_URLS_H

not_url_char.h

#ifndef GUARD_NOT_URL_CHAR_H
#define GUARD_NOT_URL_CHAR_H

bool not_url_char(char);

#endif // GUARD_NOT_URL_CHAR_H

split.h

#ifndef GUARD_SPLIT_H
#define GUARD_SPLIT_H

#include <vector>
#include <string>

std::vector<std::string> split(const std::string&);

#endif // GUARD_SPLIT_H

url_beg.h

#ifndef GUARD_URL_BEG_H

#define GUARD_URL_BEG_H

std::string::const_iterator url_beg(std::string::const_iterator, std::string::const_iterator);

#endif // GUARD_URL_BEG_H

url_end.h

#ifndef GUARD_URL_END_H

#define GUARD_URL_END_H

#include <string>

std::string::const_iterator url_end(std::string::const_iterator, std::string::const_iterator);

#endif // GUARD_URL_END_H

xref.h

#ifndef GUARD_XREF_H

#define GUARD_XREF_H

// xref.h

#include <iostream> // std::istream
#include <map> // std::map
#include <string> // std::string
#include <vector> // std::vector
#include "split.h" // split

std::map<std::string, std::vector<int> > 
xref(std::istream&, std::vector<std::string> find_words(const std::string&) = split);

#endif // GUARD_XREF_H

Test Result

Run the program and submit some random text confirms the program works as expected.

hello world http://google.co.uk and http://bbc.co.uk
and http://yahoo.com and maybe http://google.co.uk
also http://bbc.co.uk
and http://yahoo.com
and of course http://google.co.uk
and finally http://w3schools.com
and http://google.com
^Z 
http://bbc.co.uk occurs on lines: 1, 3 
http://google.co.uk occurs on lines: 1, 2, 5 
http://google.com occurs on line: 7 
http://w3schools.com occurs on line: 6 
http://yahoo.com occurs on lines: 2, 4

Reference

Koenig, Andrew & Moo, Barbara E., Accelerated C++, Addison-Wesley, 2000