Advertisement
Guest User

Untitled

a guest
Oct 15th, 2019
95
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 19.52 KB | None | 0 0
  1. /********* LexiconData.java ********
  2. *
  3. * APCS Labs 2011-2020
  4. * Cryptology
  5. * Dr. John Pais
  6. * pais.john@gmail.com
  7. * Copyright (c) 2011 to present John Pais. All rights reserved.
  8. *
  9. */
  10.  
  11. package LexiconData;
  12. import java.util.*;
  13.  
  14. public class LexiconData
  15. {
  16. protected List<Character> alphabet;
  17. protected int alphaSize;
  18. // Create ntuples of individual plaintext words or (plaintext word,
  19. // known language word) pairs. Note that this will be determined
  20. // programmatically by a ReadWriteFile method (see below).
  21. private List<Ntuple> lexicon = new ArrayList<Ntuple>();
  22. private List<String> plaintextWords = new ArrayList<String>();
  23. private List<Ntuple> equivClassesSorted = new ArrayList<Ntuple>();
  24. private Map<String, Set<String>> mapWordToEquivClass = new HashMap<String, Set<String>>();
  25. private Map<String, Integer> mapWordToEquivClassSize = new HashMap<String, Integer>();
  26. private List<Ntuple> anagramClassesSorted = new ArrayList<Ntuple>();
  27. private Map<String, Set<String>> mapWordToAnagramClass = new HashMap<String, Set<String>>();
  28. private Map<String, Integer> mapWordToAnagramClassSize = new HashMap<String, Integer>();
  29.  
  30. public LexiconData(List<Character> alphabet, String dirPath, String lexicon, boolean init)
  31. {
  32. this.alphabet = alphabet;
  33. this.alphaSize = alphabet.size();
  34. readFileOfLexicon(dirPath + lexicon);
  35. createPlaintextWords();
  36. if(init)
  37. {
  38. writeEquivClassesSorted(dirPath + "equivClassesSorted.txt");
  39. writeAnagramClassesSorted(dirPath + "anagramClassesSorted.txt");
  40. }
  41. else
  42. {
  43. readEquivClassesSorted(dirPath + "equivClassesSorted.txt");
  44. readAnagramClassesSorted(dirPath + "anagramClassesSorted.txt");
  45. }
  46. mapWordToEquivClass();
  47. mapWordToEquivClassSize();
  48. mapWordToAnagramClass();
  49. mapWordToAnagramClassSize();
  50. }
  51.  
  52. // Problem 1. Create getter for alphabet.
  53. public List<Character> getAlphabet()
  54. {
  55. return alphabet;
  56. }
  57.  
  58. // Problem 2. Create getter for alphabet size.
  59. public int getAlphaSize()
  60. {
  61. return alphaSize;
  62. }
  63.  
  64. // Problem 3. Read lexicon into List of ntuples of individual words
  65. // or possibly (mystery word, english word) pairs. Note that this will be
  66. // determined programmatically by the ReadWriteFile method createNtupleLines,
  67. // which reads each line (record) of strings into an ntuple and creates an
  68. // ArrayList of these ntuples.
  69. public void readFileOfLexicon(String inputFile)
  70. {
  71. ReadWriteFile rwf = new ReadWriteFile(inputFile,3);
  72. lexicon = rwf.getNtupleLines();
  73. }
  74.  
  75. // Problem 4. Create plaintextWords.
  76. public void createPlaintextWords()
  77. {
  78. for (Ntuple ntuple : lexicon) {
  79. plaintextWords.add((String)ntuple.getkth(0));
  80. }
  81. }
  82.  
  83. // Problem 5. Create getter for plaintextWords.
  84. public List<String> getPlaintextWords()
  85. {
  86. return plaintextWords;
  87. }
  88.  
  89. // equivClass methods
  90.  
  91. // Problem 6. Create character set of a string.
  92. // Note that a set automatically removes duplicates.
  93. public Set<Character> charSet(String str)
  94. {
  95. Set<Character> set = new HashSet<>();
  96. for (Character c : str.toCharArray()) {
  97. set.add(c);
  98. }
  99. return set;
  100. }
  101.  
  102. // Problem 7. Create same character set test for two strings.
  103. public boolean sameCharSet(String str1, String str2)
  104. {
  105. return charSet(str1).equals(charSet(str1));
  106. }
  107.  
  108. // Problem 8. Create same character set equivalence
  109. // class of a given string, since sameCharSet is an
  110. // equivalence relation. Note that this is dependent
  111. // on the list of plaintextWords created using the
  112. // current lexicon.
  113. public Set<String> equivClass(String str)
  114. {
  115. Set<String> equivClass = new HashSet<>();
  116. for (String word : plaintextWords) {
  117. if (sameCharSet(word,str)) {
  118. equivClass.add(word);
  119. }
  120. }
  121. return equivClass;
  122. }
  123.  
  124. // Problem 9. Create the set of all equivalence classes
  125. // created from a given set of strings. Note that this
  126. // is dependent on the list of plaintextWords created
  127. // using the current lexicon.
  128. public Set<Set<String>> equivClasses(Set<String> set)
  129. {
  130. Set<Set<String>> equivClasses = new HashSet<Set<String>>();
  131. for (String str : set) {
  132. equivClasses.add(equivClass(str));
  133. }
  134. return equivClasses;
  135. }
  136.  
  137. // Problem 10. Create the equivalence class of size at least minSize
  138. // of a random string of length strLen.
  139. public Set<String> equivClassRndStr(int strLen, int minSize)
  140. {
  141. Random rnd = new Random();
  142. int index = 0;
  143. while (plaintextWords.get(index).length() != strLen ||
  144. equivClass(plaintextWords.get(index)).size() < minSize) {
  145. index = rnd.nextInt(plaintextWords.size());
  146. }
  147. return equivClass(plaintextWords.get(index));
  148. }
  149.  
  150. // Problem 11. Create a list of ntuples containing equivClass
  151. // and equivClass size pairs, sort the list by size, and write
  152. // it to disk. This is a computation intensive task that may
  153. // take several minutes, so we do this only once and write the
  154. // result to disk. Then we extract any information we from the
  155. // disk file.
  156. public void writeEquivClassesSorted(String outputFile)
  157. {
  158. List<Ntuple> equivClassesSorted = new ArrayList<Ntuple>();
  159. Set<String> equivClass = new HashSet<String>();
  160. String word;
  161. Ntuple ntuple0 = new Ntuple();
  162. for(Ntuple ntuple : lexicon)
  163. {
  164. word = (String)ntuple.getkth(0);
  165. equivClass = equivClass(word);
  166. ntuple0 = new Ntuple(word,equivClass,equivClass.size());
  167. equivClassesSorted.add(ntuple0);
  168. }
  169. NtupleComparator nc = new NtupleComparator(2,1,false);
  170. nc.sortNtupleList(equivClassesSorted);
  171. ReadWriteFile rwf = new ReadWriteFile();
  172. rwf.writeNtupleOutput(equivClassesSorted, outputFile);
  173. }
  174.  
  175. // Problem 12. Convert a string representation of an ntuple comprised
  176. // of a set of strings and the size of the set into an actual Ntuple
  177. // containing the actual set and its length.
  178. public Ntuple createNtupleFromNtupleStr(String ntupleStrSetPlusLen)
  179. {
  180. int wordStart = ntupleStrSetPlusLen.indexOf("(") + 1;
  181. int wordStop = ntupleStrSetPlusLen.indexOf(",");
  182. String word = ntupleStrSetPlusLen.substring(wordStart,wordStop);
  183. Set<String> set = new HashSet<String>();
  184. int setStart = ntupleStrSetPlusLen.indexOf("[") + 1;
  185. int setStop = ntupleStrSetPlusLen.indexOf("]");
  186. String str = ntupleStrSetPlusLen.substring(setStart,setStop);
  187. int index;
  188. while(str.length() > 0)
  189. {
  190. index = str.indexOf(",");
  191. if(index != -1)
  192. {
  193. set.add(str.substring(0, index));
  194. str = str.substring(str.indexOf(",")+2);
  195. }
  196. else
  197. {
  198. set.add(str);
  199. str = "";
  200. }
  201. }
  202. return new Ntuple(word,set,set.size());
  203. }
  204.  
  205. // Problem 13. Read the file created in Problem 11 into
  206. // a list of strings and then recreate it as a list of
  207. // ntuples coded into the variable equivClassesSorted.
  208. public void readEquivClassesSorted(String inputFile)
  209. {
  210. ReadWriteFile rwf = new ReadWriteFile(inputFile);
  211. List<String> lines = rwf.getFileLines();
  212. for(String str : lines)
  213. {
  214. equivClassesSorted.add(createNtupleFromNtupleStr(str));
  215. }
  216. }
  217.  
  218. // Problem 14. Create getter for equivClassesSorted.
  219. public List<Ntuple> getEquivClassesSorted()
  220. {
  221. return equivClassesSorted;
  222. }
  223.  
  224. // Problem 15. Create mapWordToEquivClass map.
  225. @SuppressWarnings("unchecked")
  226. public void mapWordToEquivClass()
  227. {
  228. for (Ntuple ntuple : equivClassesSorted) {
  229. mapWordToEquivClass.put((String) ntuple.getkth(0),(Set<String>) ntuple.getkth(1));
  230. }
  231. }
  232.  
  233. // Problem 16. Create getter for equivClass using mapWordToEquivClass map.
  234. public Set<String> getEquivClass(String word)
  235. {
  236. return mapWordToEquivClass.get(word);
  237. }
  238.  
  239. // Problem 17. Create mapWordToEquivClassSize map.
  240. @SuppressWarnings("unchecked")
  241. public void mapWordToEquivClassSize()
  242. {
  243. for (Ntuple ntuple : equivClassesSorted) {
  244. mapWordToEquivClassSize.put((String)ntuple.getkth(0),(Integer)ntuple.getkth(2));
  245. }
  246. }
  247.  
  248. // Problem 18. Create getter for equivClassSize using mapWordToEquivClassSize map.
  249. public int getEquivClassSize(String word)
  250. {
  251. return mapWordToEquivClassSize.get(word);
  252. }
  253.  
  254. // Problem 19. Create getter for total number of EquivClasses.
  255. public int getNumEquivClasses()
  256. {
  257. Set<Set<String>> sets = new HashSet<Set<String>>();
  258. for (Set<String> set : mapWordToEquivClass.values()) {
  259. sets.add(set);
  260. }
  261. return sets.size();
  262. }
  263.  
  264. // Problem 20. Create getter for equivClass max size.
  265. public int getEquivClassMaxSize()
  266. {
  267. return (int)equivClassesSorted.get(0).getkth(2);
  268. }
  269.  
  270. // anagramClass methods
  271.  
  272. // Problem 21. Count number of occurrences of a
  273. // character in a string.
  274. public int occurr(char ch, String str)
  275. {
  276. int count = 0;
  277. for (int i = 0; i < str.length(); i++) {
  278. if (str.charAt(i) == (ch)) {
  279. count++;
  280. }
  281. }
  282. return count;
  283. }
  284.  
  285. // Problem 22. Create test whether or not strX is an anagram of str.
  286. // You must use: 1. occur above, 2. str.toCharArray(), 3. enhanced for loop
  287. // You should mirror the methods above in the anagram methods below
  288. public boolean isAnagram(String strX, String str)
  289. {
  290. // if (sameCharSet(strX,str)) {
  291. // boolean a = true;
  292. // for (Character letter : charSet(strX)) {
  293. // if (occurr(letter, strX) != occurr(letter, str)) {
  294. // a = false;
  295. // break;
  296. // }
  297. // }
  298. // return a;
  299. // }
  300. // return false;
  301. if (strX.length() > str.length()) {
  302. for (char a : strX.toCharArray()) {
  303. if (occurr(a, strX) != occurr(a, str)) {
  304. return false;
  305. }
  306. }
  307. return true;
  308. }
  309. else {
  310. for (char a : str.toCharArray()) {
  311. if (occurr(a, strX) != occurr(a, str)) {
  312. return false;
  313. }
  314. }
  315. return true;
  316. }
  317. }
  318.  
  319. // Problem 23. Create anagram equivalence class of a given string,
  320. // which refines the sameCharSet equivalence relation.
  321. public Set<String> anagramClass(String str)
  322. {
  323. Set<String> anagramClass = new HashSet<>();
  324. for (String word : plaintextWords) {
  325. if (isAnagram(word,str)) {
  326. anagramClass.add(word);
  327. }
  328. }
  329. return anagramClass;
  330. }
  331.  
  332. // Problem 24. Create the set of all anagram classes
  333. // created from a given set of strings. Note that this
  334. // is dependent on the list of plaintextWords created
  335. // using the current lexicon.
  336. public Set<Set<String>> anagramClasses(Set<String> set)
  337. {
  338. Set<Set<String>> anagramClasses = new HashSet<Set<String>>();
  339. // String str = "stoops";
  340. for (String str : set) {
  341. // System.out.print(str + " ");
  342. // System.out.println(isAnagram("p", "stoops"));
  343. // for (String str2 : )
  344. // if (isAnagram(str, ))
  345. anagramClasses.add(anagramClass(str));
  346. }
  347. System.out.println();
  348. return anagramClasses;
  349. }
  350.  
  351. // Problem 25. Create the anagram class of size at least minSize
  352. // of a random string of length strLen.
  353. public Set<String> anagramClassRndStr(int strLen, int minSize)
  354. {
  355. Random rnd = new Random();
  356. int index = 0;
  357. while (plaintextWords.get(index).length() != strLen ||
  358. anagramClass(plaintextWords.get(index)).size() < minSize) {
  359. index = rnd.nextInt(plaintextWords.size());
  360. }
  361. return anagramClass(plaintextWords.get(index));
  362. }
  363.  
  364. // Problem 26. Create a list of ntuples containing anagramClass
  365. // and anagramClass size pairs, sort the list by size, and write
  366. // it to disk. This is a computation intensive task that may
  367. // take several minutes, so we do this only once and write the
  368. // result to disk. Then we extract any information we from the
  369. // disk file.
  370. public void writeAnagramClassesSorted(String outputFile)
  371. {
  372. List<Ntuple> anagramClassesSorted = new ArrayList<Ntuple>();
  373. Set<String> anagramClass = new HashSet<String>();
  374. String word;
  375. Ntuple ntuple0 = new Ntuple();
  376. for(Ntuple ntuple : lexicon)
  377. {
  378. word = (String)ntuple.getkth(0);
  379. anagramClass = anagramClass(word);
  380. ntuple0 = new Ntuple(word,anagramClass,anagramClass.size());
  381. anagramClassesSorted.add(ntuple0);
  382. }
  383. NtupleComparator nc = new NtupleComparator(2,1,false);
  384. nc.sortNtupleList(anagramClassesSorted);
  385. ReadWriteFile rwf = new ReadWriteFile();
  386. rwf.writeNtupleOutput(anagramClassesSorted, outputFile);
  387. }
  388.  
  389. // Problem 27. Read the file created in Problem 26 into
  390. // a list of strings and then recreate it as a list of
  391. // ntuples coded into the variable anagramClassesSorted.
  392. public void readAnagramClassesSorted(String inputFile)
  393. {
  394. ReadWriteFile rwf = new ReadWriteFile(inputFile);
  395. List<String> lines = rwf.getFileLines();
  396. for(String str : lines)
  397. {
  398. anagramClassesSorted.add(createNtupleFromNtupleStr(str));
  399. }
  400. }
  401.  
  402. // Problem 28. Create getter for anagramClassesSorted.
  403. public List<Ntuple> getAnagramClassesSorted()
  404. {
  405. return anagramClassesSorted;
  406. }
  407.  
  408. // Problem 29. Create mapWordToEquivClass map.
  409. @SuppressWarnings("unchecked")
  410. public void mapWordToAnagramClass()
  411. {
  412. for (Ntuple ntuple : anagramClassesSorted) {
  413. mapWordToAnagramClass.put((String) ntuple.getkth(0),(Set<String>) ntuple.getkth(1));
  414. }
  415. }
  416.  
  417. // Problem 30. Create getter for anagramClass using mapWordToAnagramClass map.
  418. public Set<String> getAnagramClass(String word)
  419. {
  420. return mapWordToAnagramClass.get(word);
  421. }
  422.  
  423. // Problem 31. Create mapWordToAnagramClassSize map.
  424. @SuppressWarnings("unchecked")
  425. public void mapWordToAnagramClassSize()
  426. {
  427. for (Ntuple ntuple : anagramClassesSorted) {
  428. mapWordToAnagramClassSize.put((String)ntuple.getkth(0),(Integer)ntuple.getkth(2));
  429. }
  430. }
  431.  
  432. // Problem 32. Create getter for anagramClassSize using mapWordToAnagramClassSize map.
  433. public int getAnagramClassSize(String word)
  434. {
  435. return mapWordToAnagramClassSize.get(word);
  436. }
  437.  
  438. // Problem 33. Create getter for total number of AnagramClasses.
  439. public int getNumAnagramClasses()
  440. {
  441. Set<Set<String>> sets = new HashSet<Set<String>>();
  442. for (Set<String> set : mapWordToAnagramClass.values()) {
  443. sets.add(set);
  444. }
  445. return sets.size();
  446. }
  447.  
  448. // Problem 34. Create getter for equivClass max size.
  449. public int getAnagramClassMaxSize()
  450. {
  451. return (int)anagramClassesSorted.get(0).getkth(2);
  452. }
  453.  
  454. // lexicon stats
  455.  
  456. // Problem 35. Create array of alphabet character counts for a given string.
  457. public double[] charCountArray(String str)
  458. {
  459. double[] arr = new double[26];
  460. for (int i = 0; i < str.length(); i++) {
  461. arr[Character.getNumericValue(str.charAt(i))-10]++;
  462. }
  463. return arr;
  464. }
  465.  
  466. // Problem 36. Create array of alphabet character count totals
  467. // for all strings in lexicon keySet.
  468. public double[] charCountArrayTotal()
  469. {
  470. double[] arr = new double[26];
  471. for (String str : plaintextWords) {
  472. for (int i = 0; i < str.length(); i++) {
  473. if (Character.isLetter(str.charAt(i))) {
  474. arr[Character.getNumericValue(str.charAt(i))-10]++;
  475. }
  476. }
  477. }
  478. return arr;
  479. }
  480.  
  481. // Problem 37. Create array of alphabet character % totals
  482. // for all strings in lexicon keySet.
  483. public double[] charPercentArray()
  484. {
  485. double[] arr = new double[26];
  486. for (String str : plaintextWords) {
  487. for (int i = 0; i < str.length(); i++) {
  488. if (Character.isLetter(str.charAt(i))) {
  489. arr[Character.getNumericValue(str.charAt(i))-10]++;
  490. }
  491. }
  492. }
  493. double sum = 0;
  494. for (int i = 0; i < arr.length; i++) {
  495. sum += arr[i];
  496. }
  497. for (int i = 0; i < arr.length; i++) {
  498. arr[i] *= 100/sum;
  499. }
  500. return arr;
  501. }
  502.  
  503. // Problem 38. Count (recursively) the number of occurrences of str1 in str2.
  504. public int countOccurr(String str1, String str2)
  505. {
  506. if (str1.length() > str2.length()) {
  507. return 0;
  508. }
  509. else if (str2.substring(0,str1.length()).equals(str1)) {
  510. return 1 + countOccurr(str1, str2.substring(1));
  511. }
  512. else return countOccurr(str1, str2.substring(1));
  513. }
  514.  
  515. // Problem 39. Create ntuples of bigrams (pairs of characters)
  516. // sorted by percent.
  517. public List<Ntuple> bigramPercentSortedNtuples()
  518. {
  519. ArrayList<Ntuple> ntuples = new ArrayList<Ntuple>();
  520. List<String> bigrams = new ArrayList<String>();
  521.  
  522. // insert your code here
  523. }
  524.  
  525. // Problem 40. Create array of percentages of word lengths. The value at each
  526. // index i is the percentage of words of length i.
  527. public double[] wordLengthPercentArray(int maxLen)
  528. {
  529. double[] arr = new double[maxLen+1];
  530. for (String str : plaintextWords) {
  531. arr[str.length()]++;
  532. }
  533. double sum = 0;
  534. for (int i = 0; i < arr.length; i++) {
  535. sum += arr[i];
  536. }
  537. for (int i = 0; i < arr.length; i++) {
  538. arr[i] *= 100/sum;
  539. }
  540. return arr;
  541. }
  542.  
  543. // Problem 41. Create lexicon report.
  544. public void lexiconReport(String wordInLexicon)
  545. {
  546.  
  547. System.out.println("\nLexicon Data: Alphabet, Words, EquivClasses & AnigramClasses");
  548. System.out.println("getAlphabet() = " + getAlphabet());
  549. System.out.println("getAlphaSize() = " + getAlphaSize());
  550.  
  551.  
  552. System.out.println("getPlaintextWords() = " + getPlaintextWords().subList(0, 50));
  553. System.out.println("getPlaintextWords().size() = " + getPlaintextWords().size());
  554.  
  555. System.out.println("\ncreateNtupleFromNtupleStr = " + createNtupleFromNtupleStr("Ntuple(post,[stoops, opts, post, stop, stoop, spot, spots, tops, pots, stops, posts],11)"));
  556. System.out.println("getEquivClassesSorted().subList(0, 50) = " + getEquivClassesSorted().subList(0, 50));
  557.  
  558. System.out.println("getNumEquivClasses() = " + getNumEquivClasses());
  559. System.out.println("\nwordInLexicon = " + wordInLexicon);
  560. System.out.println("getEquivClass(wordInLexicon) = " + getEquivClass(wordInLexicon));
  561. System.out.println("getEquivClassSize(wordInLexicon) = " + getEquivClassSize(wordInLexicon));
  562. System.out.println("getEquivClassMaxSize = " + getEquivClassMaxSize());
  563.  
  564. System.out.println("\ngetAnagramClassesSorted().subList(0, 50) = " + getAnagramClassesSorted().subList(0, 50));
  565. System.out.println("getNumAnagramClasses() = " + getNumAnagramClasses());
  566. System.out.println("\nwordInLexicon = " + wordInLexicon);
  567. System.out.println("getAnagramClass(wordInLexicon) = " + getAnagramClass(wordInLexicon));
  568. System.out.println("getAnagramClassSize(wordInLexicon) = " + getAnagramClassSize(wordInLexicon));
  569. System.out.println("getAnagramClassMaxSize = " + getAnagramClassMaxSize());
  570. System.out.println("\nwordInLexicon = " + wordInLexicon);
  571. System.out.println("getEquivClass(wordInLexicon) = " + getEquivClass(wordInLexicon));
  572. System.out.println("anagramClasses(getEquivClass(wordInLexicon)) = " + anagramClasses(getEquivClass(wordInLexicon)));
  573. System.out.println("getAnagramClass(wordInLexicon) = " + getAnagramClass(wordInLexicon));
  574. System.out.println("equivClasses(getAnagramClass(wordInLexicon)) = " + anagramClasses(getEquivClass(wordInLexicon)));
  575. // System.out.println("equivClasses(getAnagramClass(wordInLexicon)) = " + equivClasses(getAnagramClass(wordInLexicon)));
  576.  
  577. System.out.println("\nLexicon Data: Character Counts & Percentages, Bigram Percentages and Word length Percentages");
  578. System.out.println("getAlphabet() = " + getAlphabet());
  579. System.out.println("wordInLexicon = " + wordInLexicon);
  580.  
  581. System.out.println("charCountArray(wordInLexicon) = " + Arrays.toString(charCountArray(wordInLexicon)));
  582. System.out.println("charCountArrayTotal() = " + Arrays.toString(charCountArrayTotal()));
  583. System.out.println("charPercentArray() = " + Arrays.toString(charPercentArray()));
  584. // List<Ntuple> bigramPercentSortedNtuples = bigramPercentSortedNtuples();
  585. // System.out.println("\nbigramPercentSortedNtuples() = " + bigramPercentSortedNtuples);
  586. // System.out.println("bigramPercentSortedNtuples().size() = " + bigramPercentSortedNtuples.size() +" (with nonzero percent out of " + getAlphaSize()*getAlphaSize() + " bigrams)");
  587. System.out.println("wordLengthPercentArray(27) = " + Arrays.toString(wordLengthPercentArray(27)));
  588.  
  589. }
  590.  
  591. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement