Advertisement
superpawko

Untitled

Oct 9th, 2016
69
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.63 KB | None | 0 0
  1. import java.io.BufferedReader;
  2. import java.io.FileReader;
  3. import java.util.ArrayList;
  4. import java.util.HashMap;
  5. import java.util.HashSet;
  6. import java.util.List;
  7. import java.util.Map;
  8. import java.util.Set;
  9. import java.util.StringTokenizer;
  10.  
  11.  
  /**
   * Makes a concordance: an index of all the places where a word is used in a corpus
   * such as Shakespeare's plays.
   *
   * <p>The corpus is read from {@code shakespeare.json} in the working directory. Only
   * lines of the form {@code "text_entry": "some play text",} are used; the quoted text
   * of each such line is collected in {@link #allLines}, and an index mapping every
   * lower-cased word to the set of lines containing it is then built from them.
   */
  public class Concordance {

      /** Corpus file read by {@link #main(String[])}. */
      private static final String CORPUS_FILE = "shakespeare.json";

      /** JSON key (including its quotes) that marks a line holding play text. */
      private static final String TEXT_ENTRY_KEY = "\"text_entry\"";

      /** Space and punctuation characters that separate words. */
      private static final String WORD_DELIMITERS = " ,.!?;:()[]{}'\"-_+=<>/\\`~$|!@#$%^&*";

      /** Every play-text line extracted from the corpus, in file order. */
      public static List<String> allLines = new ArrayList<String>();

      /**
       * Reads the corpus file, collects its text entries in {@link #allLines} and builds
       * the word index from them.
       *
       * @param args command-line arguments (unused)
       * @throws Exception if the corpus file cannot be opened or read
       */
      public static void main(String[] args) throws Exception {
          BufferedReader reader = new BufferedReader(new FileReader(CORPUS_FILE));
          try {
              // Advance to the next line on every iteration; reading only once before the
              // loop would test the same line forever.
              for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                  String text = extractTextEntry(line);
                  if (text != null) {
                      allLines.add(text);
                  }
              }
          } finally {
              // Release the file handle even when reading fails part-way through.
              reader.close();
          }

          // Build an index that maps each word to the set of lines that contain the word.
          Map<String, Set<String>> index = buildIndex(allLines);
      }

      /**
       * Extracts the quoted value from a line of the form
       * {@code "text_entry": "A bird of my tongue is better than a beast of yours.",}.
       *
       * <p>The value is located by its surrounding double quotes rather than by fixed
       * offsets, so leading indentation and an optional trailing comma do not matter.
       *
       * @param line one raw line of the corpus file; may be {@code null}
       * @return the text between the value's opening quote and the last double quote on
       *         the line, or {@code null} if the line is not a well-formed text entry
       */
      static String extractTextEntry(String line) {
          if (line == null) {
              return null;
          }
          int keyStart = line.indexOf(TEXT_ENTRY_KEY);
          if (keyStart < 0) {
              return null;
          }
          int colon = line.indexOf(':', keyStart + TEXT_ENTRY_KEY.length());
          if (colon < 0) {
              return null;
          }
          int openQuote = line.indexOf('"', colon + 1);
          int closeQuote = line.lastIndexOf('"');
          if (openQuote < 0 || closeQuote <= openQuote) {
              return null;
          }
          return line.substring(openQuote + 1, closeQuote);
      }

      /**
       * Builds an index that maps each lower-cased word to the set of lines containing it.
       *
       * @param lines the lines of text to index
       * @return a map from word to the distinct lines in which that word occurs
       */
      static Map<String, Set<String>> buildIndex(List<String> lines) {
          Map<String, Set<String>> index = new HashMap<String, Set<String>>();
          for (String text : lines) {
              StringTokenizer tokenizer = new StringTokenizer(
                      text,             // string to split up into words ("tokens")
                      WORD_DELIMITERS,  // space & punctuation separates words
                      false             // don't return the delimiters as tokens
              );
              while (tokenizer.hasMoreTokens()) {
                  // Locale.ROOT keeps the folding independent of the JVM's default locale.
                  String word = tokenizer.nextToken().toLowerCase(java.util.Locale.ROOT);
                  Set<String> linesContainingWord = index.get(word);
                  if (linesContainingWord == null) {
                      // First time we've seen this word -- create a set for it
                      linesContainingWord = new HashSet<String>();
                      index.put(word, linesContainingWord);
                  }
                  // Record the current line for first and repeat sightings alike.
                  linesContainingWord.add(text);
              }
          }
          return index;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement