I have to: 1. Retrieve the document text from the web (provided by a utility class). 2. Filter the desired "words" from the document and, one by one, store each word as a key in a Map<String,Integer> object, where the value is the number of occurrences of the word. 3. Read the (word, num_occurrences) map entry pairs into an array/list structure of your choice. 4. Sort the pair list by num_occurrences. 5. Print the total number of words processed, the number of unique words, and the N pairs which have the largest number of occurrences.
Here's what I have so far. The first class is the WebDoc utility class and the second is the main class. I have added comment blocks marking the sections where the new code should go. Please help!
package util;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URL;
import java.net.MalformedURLException;

/**
 * Utility for extracting the text of an HTML document fetched from a URL.
 */
public class WebDoc {

    /**
     * Opens {@code urlstr}, parses the response as HTML and returns all text
     * reported by the parser after the opening BODY tag has been seen. Each
     * text chunk is followed by a single space.
     *
     * @param urlstr the URL of the document to read
     * @return the concatenated text chunks found after the BODY start tag;
     *         empty if no BODY tag or no text was encountered
     * @throws MalformedURLException if {@code urlstr} is not a valid URL
     * @throws IOException if the document cannot be opened or read
     */
    public static String getBodyContent(String urlstr)
            throws MalformedURLException, IOException {
        /*
         * The following convoluted code is necessary because getParser()
         * is a protected method in HTMLEditorKit.
         * We create an anonymous extension of HTMLEditorKit with a public
         * getParser method calling the protected method of the superclass.
         */
        HTMLEditorKit.Parser parser = new HTMLEditorKit() {
            @Override
            public HTMLEditorKit.Parser getParser() {
                return super.getParser();
            }
        }.getParser();

        // Mutable state shared with the callback below. A StringBuilder is
        // used so that appending many text chunks stays linear in the total
        // length instead of re-copying the whole string on every chunk.
        class DocStatus {
            final StringBuilder content = new StringBuilder();
            boolean bodyStarted = false;
        }
        final DocStatus status = new DocStatus();

        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
            // handle the tags: look for the BODY tag
            @Override
            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                if (t == HTML.Tag.BODY) {
                    status.bodyStarted = true;
                }
            }

            // handle the text between tags: concatenate all text after BODY tag
            @Override
            public void handleText(char[] text, int position) {
                if (status.bodyStarted) {
                    status.content.append(text).append(' ');
                }
            }
        };

        URL url = new URL(urlstr);
        // Decode as UTF-8 explicitly rather than with the platform default
        // charset, so the result does not depend on the machine's settings.
        InputStreamReader r = new InputStreamReader(url.openStream(), "UTF-8");
        try {
            // third argument: ignore any charset specified inside the document
            parser.parse(r, callback, true);
        } finally {
            // Always release the underlying connection, even if parsing fails.
            r.close();
        }
        return status.content.toString();
    }
}
package dsprog3;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.WebDoc;

/**
 * Prints a word-frequency report for the body text of a web page.
 */
public class DSProg3 {

    /**
     * Fetches the body text of a fixed test URL, counts every word of five or
     * more ASCII letters (case-insensitively), and prints the total number of
     * words processed, the number of unique words, and the N most frequent
     * words with their counts.
     *
     * If the document cannot be retrieved, the stack trace is printed and the
     * program exits with status 1.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // test URLs
        String url = "http://en.wikipedia.org/wiki/Jimi_Hendrix";

        final int N = 25; // the number of word/frequency pairs to print

        // word pattern recognizes a string of 5 or more letters
        String wordPattern = "[A-Za-z]{5,}";

        String content = null;
        try {
            content = WebDoc.getBodyContent(url); // get body of the web document
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }

        Map<String, Integer> wordCount = new HashMap<String, Integer>();
        int totalWords = 0;

        Matcher match = Pattern.compile(wordPattern).matcher(content);
        while (match.find()) {
            ++totalWords;

            // get the next word which matches the word pattern and normalize
            // it by making it lower case; Locale.ROOT keeps the mapping
            // independent of the default locale (e.g. Turkish dotless i)
            String word = match.group().toLowerCase(Locale.ROOT);

            // "register" one more occurrence of the key, word, in wordCount
            Integer seen = wordCount.get(word);
            wordCount.put(word, seen == null ? 1 : seen + 1);
        }

        // immutable (word, number of occurrences) pair used for sorting
        class WordPair {
            final String word;
            final Integer count; // number of occurrences

            WordPair(String word, Integer count) {
                this.word = word;
                this.count = count;
            }
        }

        // copy the map entries into a list so they can be ordered
        List<WordPair> pairs = new ArrayList<WordPair>(wordCount.size());
        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            pairs.add(new WordPair(entry.getKey(), entry.getValue()));
        }

        // highest count first; ties are broken alphabetically so that the
        // output does not depend on HashMap iteration order
        Collections.sort(pairs, new Comparator<WordPair>() {
            @Override
            public int compare(WordPair a, WordPair b) {
                int byCount = b.count.compareTo(a.count);
                return byCount != 0 ? byCount : a.word.compareTo(b.word);
            }
        });

        System.out.println("Total words processed: " + totalWords);
        System.out.println("Unique words: " + wordCount.size());

        // the document may contain fewer than N distinct words
        int shown = Math.min(N, pairs.size());
        System.out.println("Top " + shown + " words by number of occurrences:");
        for (int i = 0; i < shown; i++) {
            WordPair pair = pairs.get(i);
            System.out.println(pair.word + " " + pair.count);
        }
    }
}