Retrieve a list of words from a website and show a word count plus a specified number of most frequently occurring words

I have to: 1. Retrieve the document text from the web (provided by a utility class). 2. Filter the desired "words" from the document and, one by one, store each word as a key in a Map<String,Integer> object whose value is the number of occurrences of that word. 3. Read the (word, num_occurrences) map entry pairs into an array/list structure of your choice. 4. Sort the pair list by num_occurrences. 5. Print the total number of words processed, the number of unique words, and the N pairs which have the largest number of occurrences.

Here's what I have so far. The first class is the WebDoc utility class and the second is the main class. I have added commented blocks marking where the new code should go. Please help!

package util;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URL;
import java.net.MalformedURLException;

/**
 * Utility for extracting the text content of an HTML document's body.
 */
public class WebDoc {

  /**
   * Opens the given URL, parses it as HTML and returns the text found
   * after the BODY start tag.
   *
   * Every run of text reported by the parser is appended to the result
   * followed by a single space, so the returned string ends with a space
   * whenever any body text was found.
   *
   * @param urlstr the address of the document to read
   * @return the concatenated body text, or an empty string if no text
   *         was reported after a BODY tag
   * @throws MalformedURLException if {@code urlstr} is not a valid URL
   * @throws IOException if the document cannot be opened or read
   */
  public static String getBodyContent(String urlstr)
          throws MalformedURLException, IOException {
    /*
     * getParser() is a protected method in HTMLEditorKit, so we create
     * an anonymous extension of HTMLEditorKit with a public getParser
     * method that simply delegates to the superclass.
     */
    HTMLEditorKit.Parser parser = new HTMLEditorKit() {

      @Override
      public HTMLEditorKit.Parser getParser() {
        return super.getParser();
      }

    }.getParser();

    // Mutable state shared with the callback below. A StringBuilder is
    // used so that appending each text run does not re-copy everything
    // collected so far.
    class DocStatus {
      final StringBuilder content = new StringBuilder();
      boolean body_started = false;
    }

    final DocStatus status = new DocStatus();

    HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {

      // handle the tags: look for the BODY tag
      @Override
      public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
        if (t == HTML.Tag.BODY) {
          status.body_started = true;
        }
      }

      // handle the text between tags: collect all text after the BODY tag
      @Override
      public void handleText(char[] text, int position) {
        if (status.body_started) {
          status.content.append(text).append(' ');
        }
      }
    };

    URL url = new URL(urlstr);

    // NOTE(review): the reader decodes with the platform default charset
    // and the parser is told to ignore any charset declared in the
    // document (third argument) -- confirm this suits the pages fetched.
    InputStreamReader r = new InputStreamReader(url.openStream());
    try {
      parser.parse(r, callback, true);
    } finally {
      // Always release the underlying stream, even if parsing fails.
      r.close();
    }

    return status.content.toString();
  }
}

package dsprog3;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.WebDoc;

/**
 * Downloads a web page, counts how often each word occurs in its body
 * text and prints the most frequent words.
 */
public class DSProg3 {

    /**
     * Fetches the test URL, tallies every match of the word pattern
     * (case-insensitively), and prints the total number of words, the
     * number of distinct words and the N most frequent words.
     *
     * Exits with status 1 if the document cannot be retrieved.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url;

        //test URLs
        url = "http://en.wikipedia.org/wiki/Jimi_Hendrix";

        final int N = 25; //the number of word/frequency pairs to print

        //word pattern recognizes a string of 5 or more letters
        String word_pattern = "[A-Za-z]{5,}";

        String content = null;
        try {
            content = WebDoc.getBodyContent(url); // get body of the web document
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }

        Map<String,Integer> wordCount = new HashMap<String,Integer>();

        int total_words = 0;
        Matcher match = Pattern.compile(word_pattern).matcher(content);
        while(match.find()){
            ++total_words;
            //get the next word which matches the word_pattern and
            //normalize it by making it lower case; a fixed locale keeps
            //the result independent of the machine's default locale
            String word = match.group().toLowerCase(Locale.ENGLISH);

            //"register" one more occurrence of word in the wordCount map
            Integer seen = wordCount.get(word);
            wordCount.put(word, seen == null ? 1 : seen + 1);
        }

        //a (word, number of occurrences) pair taken from the map
        class WordPair {
            String word;
            Integer count; // number of occurrences
            WordPair(String word, Integer count) {
                this.word = word;
                this.count = count;
            }
        }

        //copy the map entries into a list so that they can be sorted
        List<WordPair> pairs = new ArrayList<WordPair>(wordCount.size());
        for (Map.Entry<String,Integer> entry : wordCount.entrySet()) {
            pairs.add(new WordPair(entry.getKey(), entry.getValue()));
        }

        //order by count, largest first; words with equal counts are
        //ordered alphabetically so the output is deterministic
        Collections.sort(pairs, new Comparator<WordPair>() {
            public int compare(WordPair a, WordPair b) {
                int byCount = b.count.compareTo(a.count);
                return byCount != 0 ? byCount : a.word.compareTo(b.word);
            }
        });

        //report the totals and the most frequent words; there may be
        //fewer than N distinct words in the document
        System.out.println("Total words processed: " + total_words);
        System.out.println("Unique words: " + wordCount.size());

        int limit = Math.min(N, pairs.size());
        System.out.println("Top " + limit + " words:");
        for (int i = 0; i < limit; i++) {
            WordPair pair = pairs.get(i);
            System.out.println(pair.word + " : " + pair.count);
        }
    }

}

View Answers