Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /********* LexiconData.java ********
- *
- * APCS Labs 2011-2020
- * Cryptology
- * Dr. John Pais
- * pais.john@gmail.com
- * Copyright (c) 2011 to present John Pais. All rights reserved.
- *
- */
- package LexiconData;
- import java.util.*;
- public class LexiconData
- {
- protected List<Character> alphabet;
- protected int alphaSize;
- // Create ntuples of individual plaintext words or (plaintext word,
- // known language word) pairs. Note that this will be determined
- // programmatically by a ReadWriteFile method (see below).
- private List<Ntuple> lexicon = new ArrayList<Ntuple>();
- private List<String> plaintextWords = new ArrayList<String>();
- private List<Ntuple> equivClassesSorted = new ArrayList<Ntuple>();
- private Map<String, Set<String>> mapWordToEquivClass = new HashMap<String, Set<String>>();
- private Map<String, Integer> mapWordToEquivClassSize = new HashMap<String, Integer>();
- private List<Ntuple> anagramClassesSorted = new ArrayList<Ntuple>();
- private Map<String, Set<String>> mapWordToAnagramClass = new HashMap<String, Set<String>>();
- private Map<String, Integer> mapWordToAnagramClassSize = new HashMap<String, Integer>();
- public LexiconData(List<Character> alphabet, String dirPath, String lexicon, boolean init)
- {
- this.alphabet = alphabet;
- this.alphaSize = alphabet.size();
- readFileOfLexicon(dirPath + lexicon);
- createPlaintextWords();
- if(init)
- {
- writeEquivClassesSorted(dirPath + "equivClassesSorted.txt");
- writeAnagramClassesSorted(dirPath + "anagramClassesSorted.txt");
- }
- else
- {
- readEquivClassesSorted(dirPath + "equivClassesSorted.txt");
- readAnagramClassesSorted(dirPath + "anagramClassesSorted.txt");
- }
- mapWordToEquivClass();
- mapWordToEquivClassSize();
- mapWordToAnagramClass();
- mapWordToAnagramClassSize();
- }
- // Problem 1. Create getter for alphabet.
- public List<Character> getAlphabet()
- {
- return alphabet;
- }
- // Problem 2. Create getter for alphabet size.
- public int getAlphaSize()
- {
- return alphaSize;
- }
- // Problem 3. Read lexicon into List of ntuples of individual words
- // or possibly (mystery word, english word) pairs. Note that this will be
- // determined programmatically by the ReadWriteFile method createNtupleLines,
- // which reads each line (record) of strings into an ntuple and creates an
- // ArrayList of these ntuples.
- public void readFileOfLexicon(String inputFile)
- {
- ReadWriteFile rwf = new ReadWriteFile(inputFile,3);
- lexicon = rwf.getNtupleLines();
- }
- // Problem 4. Create plaintextWords.
- public void createPlaintextWords()
- {
- for (Ntuple ntuple : lexicon) {
- plaintextWords.add((String)ntuple.getkth(0));
- }
- }
- // Problem 5. Create getter for plaintextWords.
- public List<String> getPlaintextWords()
- {
- return plaintextWords;
- }
- // equivClass methods
- // Problem 6. Create character set of a string.
- // Note that a set automatically removes duplicates.
- public Set<Character> charSet(String str)
- {
- Set<Character> set = new HashSet<>();
- for (Character c : str.toCharArray()) {
- set.add(c);
- }
- return set;
- }
- // Problem 7. Create same character set test for two strings.
- public boolean sameCharSet(String str1, String str2)
- {
- return charSet(str1).equals(charSet(str1));
- }
- // Problem 8. Create same character set equivalence
- // class of a given string, since sameCharSet is an
- // equivalence relation. Note that this is dependent
- // on the list of plaintextWords created using the
- // current lexicon.
- public Set<String> equivClass(String str)
- {
- Set<String> equivClass = new HashSet<>();
- for (String word : plaintextWords) {
- if (sameCharSet(word,str)) {
- equivClass.add(word);
- }
- }
- return equivClass;
- }
- // Problem 9. Create the set of all equivalence classes
- // created from a given set of strings. Note that this
- // is dependent on the list of plaintextWords created
- // using the current lexicon.
- public Set<Set<String>> equivClasses(Set<String> set)
- {
- Set<Set<String>> equivClasses = new HashSet<Set<String>>();
- for (String str : set) {
- equivClasses.add(equivClass(str));
- }
- return equivClasses;
- }
- // Problem 10. Create the equivalence class of size at least minSize
- // of a random string of length strLen.
- public Set<String> equivClassRndStr(int strLen, int minSize)
- {
- Random rnd = new Random();
- int index = 0;
- while (plaintextWords.get(index).length() != strLen ||
- equivClass(plaintextWords.get(index)).size() < minSize) {
- index = rnd.nextInt(plaintextWords.size());
- }
- return equivClass(plaintextWords.get(index));
- }
- // Problem 11. Create a list of ntuples containing equivClass
- // and equivClass size pairs, sort the list by size, and write
- // it to disk. This is a computation intensive task that may
- // take several minutes, so we do this only once and write the
- // result to disk. Then we extract any information we need from the
- // disk file.
- public void writeEquivClassesSorted(String outputFile)
- {
- List<Ntuple> equivClassesSorted = new ArrayList<Ntuple>();
- Set<String> equivClass = new HashSet<String>();
- String word;
- Ntuple ntuple0 = new Ntuple();
- for(Ntuple ntuple : lexicon)
- {
- word = (String)ntuple.getkth(0);
- equivClass = equivClass(word);
- ntuple0 = new Ntuple(word,equivClass,equivClass.size());
- equivClassesSorted.add(ntuple0);
- }
- NtupleComparator nc = new NtupleComparator(2,1,false);
- nc.sortNtupleList(equivClassesSorted);
- ReadWriteFile rwf = new ReadWriteFile();
- rwf.writeNtupleOutput(equivClassesSorted, outputFile);
- }
- // Problem 12. Convert a string representation of an ntuple comprised
- // of a set of strings and the size of the set into an actual Ntuple
- // containing the actual set and its length.
- public Ntuple createNtupleFromNtupleStr(String ntupleStrSetPlusLen)
- {
- int wordStart = ntupleStrSetPlusLen.indexOf("(") + 1;
- int wordStop = ntupleStrSetPlusLen.indexOf(",");
- String word = ntupleStrSetPlusLen.substring(wordStart,wordStop);
- Set<String> set = new HashSet<String>();
- int setStart = ntupleStrSetPlusLen.indexOf("[") + 1;
- int setStop = ntupleStrSetPlusLen.indexOf("]");
- String str = ntupleStrSetPlusLen.substring(setStart,setStop);
- int index;
- while(str.length() > 0)
- {
- index = str.indexOf(",");
- if(index != -1)
- {
- set.add(str.substring(0, index));
- str = str.substring(str.indexOf(",")+2);
- }
- else
- {
- set.add(str);
- str = "";
- }
- }
- return new Ntuple(word,set,set.size());
- }
- // Problem 13. Read the file created in Problem 11 into
- // a list of strings and then recreate it as a list of
- // ntuples coded into the variable equivClassesSorted.
- public void readEquivClassesSorted(String inputFile)
- {
- ReadWriteFile rwf = new ReadWriteFile(inputFile);
- List<String> lines = rwf.getFileLines();
- for(String str : lines)
- {
- equivClassesSorted.add(createNtupleFromNtupleStr(str));
- }
- }
- // Problem 14. Create getter for equivClassesSorted.
- public List<Ntuple> getEquivClassesSorted()
- {
- return equivClassesSorted;
- }
- // Problem 15. Create mapWordToEquivClass map.
- @SuppressWarnings("unchecked")
- public void mapWordToEquivClass()
- {
- for (Ntuple ntuple : equivClassesSorted) {
- mapWordToEquivClass.put((String) ntuple.getkth(0),(Set<String>) ntuple.getkth(1));
- }
- }
- // Problem 16. Create getter for equivClass using mapWordToEquivClass map.
- public Set<String> getEquivClass(String word)
- {
- return mapWordToEquivClass.get(word);
- }
- // Problem 17. Create mapWordToEquivClassSize map.
- @SuppressWarnings("unchecked")
- public void mapWordToEquivClassSize()
- {
- for (Ntuple ntuple : equivClassesSorted) {
- mapWordToEquivClassSize.put((String)ntuple.getkth(0),(Integer)ntuple.getkth(2));
- }
- }
- // Problem 18. Create getter for equivClassSize using mapWordToEquivClassSize map.
- public int getEquivClassSize(String word)
- {
- return mapWordToEquivClassSize.get(word);
- }
- // Problem 19. Create getter for total number of EquivClasses.
- public int getNumEquivClasses()
- {
- Set<Set<String>> sets = new HashSet<Set<String>>();
- for (Set<String> set : mapWordToEquivClass.values()) {
- sets.add(set);
- }
- return sets.size();
- }
- // Problem 20. Create getter for equivClass max size.
- public int getEquivClassMaxSize()
- {
- return (int)equivClassesSorted.get(0).getkth(2);
- }
- // anagramClass methods
- // Problem 21. Count number of occurrences of a
- // character in a string.
- public int occurr(char ch, String str)
- {
- int count = 0;
- for (int i = 0; i < str.length(); i++) {
- if (str.charAt(i) == (ch)) {
- count++;
- }
- }
- return count;
- }
- // Problem 22. Create test whether or not strX is an anagram of str.
- // You must use: 1. occurr above, 2. str.toCharArray(), 3. enhanced for loop
- // You should mirror the methods above in the anagram methods below
- public boolean isAnagram(String strX, String str)
- {
- // if (sameCharSet(strX,str)) {
- // boolean a = true;
- // for (Character letter : charSet(strX)) {
- // if (occurr(letter, strX) != occurr(letter, str)) {
- // a = false;
- // break;
- // }
- // }
- // return a;
- // }
- // return false;
- if (strX.length() > str.length()) {
- for (char a : strX.toCharArray()) {
- if (occurr(a, strX) != occurr(a, str)) {
- return false;
- }
- }
- return true;
- }
- else {
- for (char a : str.toCharArray()) {
- if (occurr(a, strX) != occurr(a, str)) {
- return false;
- }
- }
- return true;
- }
- }
- // Problem 23. Create anagram equivalence class of a given string,
- // which refines the sameCharSet equivalence relation.
- public Set<String> anagramClass(String str)
- {
- Set<String> anagramClass = new HashSet<>();
- for (String word : plaintextWords) {
- if (isAnagram(word,str)) {
- anagramClass.add(word);
- }
- }
- return anagramClass;
- }
- // Problem 24. Create the set of all anagram classes
- // created from a given set of strings. Note that this
- // is dependent on the list of plaintextWords created
- // using the current lexicon.
- public Set<Set<String>> anagramClasses(Set<String> set)
- {
- Set<Set<String>> anagramClasses = new HashSet<Set<String>>();
- // String str = "stoops";
- for (String str : set) {
- // System.out.print(str + " ");
- // System.out.println(isAnagram("p", "stoops"));
- // for (String str2 : )
- // if (isAnagram(str, ))
- anagramClasses.add(anagramClass(str));
- }
- System.out.println();
- return anagramClasses;
- }
- // Problem 25. Create the anagram class of size at least minSize
- // of a random string of length strLen.
- public Set<String> anagramClassRndStr(int strLen, int minSize)
- {
- Random rnd = new Random();
- int index = 0;
- while (plaintextWords.get(index).length() != strLen ||
- anagramClass(plaintextWords.get(index)).size() < minSize) {
- index = rnd.nextInt(plaintextWords.size());
- }
- return anagramClass(plaintextWords.get(index));
- }
- // Problem 26. Create a list of ntuples containing anagramClass
- // and anagramClass size pairs, sort the list by size, and write
- // it to disk. This is a computation intensive task that may
- // take several minutes, so we do this only once and write the
- // result to disk. Then we extract any information we need from the
- // disk file.
- public void writeAnagramClassesSorted(String outputFile)
- {
- List<Ntuple> anagramClassesSorted = new ArrayList<Ntuple>();
- Set<String> anagramClass = new HashSet<String>();
- String word;
- Ntuple ntuple0 = new Ntuple();
- for(Ntuple ntuple : lexicon)
- {
- word = (String)ntuple.getkth(0);
- anagramClass = anagramClass(word);
- ntuple0 = new Ntuple(word,anagramClass,anagramClass.size());
- anagramClassesSorted.add(ntuple0);
- }
- NtupleComparator nc = new NtupleComparator(2,1,false);
- nc.sortNtupleList(anagramClassesSorted);
- ReadWriteFile rwf = new ReadWriteFile();
- rwf.writeNtupleOutput(anagramClassesSorted, outputFile);
- }
- // Problem 27. Read the file created in Problem 26 into
- // a list of strings and then recreate it as a list of
- // ntuples coded into the variable anagramClassesSorted.
- public void readAnagramClassesSorted(String inputFile)
- {
- ReadWriteFile rwf = new ReadWriteFile(inputFile);
- List<String> lines = rwf.getFileLines();
- for(String str : lines)
- {
- anagramClassesSorted.add(createNtupleFromNtupleStr(str));
- }
- }
- // Problem 28. Create getter for anagramClassesSorted.
- public List<Ntuple> getAnagramClassesSorted()
- {
- return anagramClassesSorted;
- }
- // Problem 29. Create mapWordToAnagramClass map.
- @SuppressWarnings("unchecked")
- public void mapWordToAnagramClass()
- {
- for (Ntuple ntuple : anagramClassesSorted) {
- mapWordToAnagramClass.put((String) ntuple.getkth(0),(Set<String>) ntuple.getkth(1));
- }
- }
- // Problem 30. Create getter for anagramClass using mapWordToAnagramClass map.
- public Set<String> getAnagramClass(String word)
- {
- return mapWordToAnagramClass.get(word);
- }
- // Problem 31. Create mapWordToAnagramClassSize map.
- @SuppressWarnings("unchecked")
- public void mapWordToAnagramClassSize()
- {
- for (Ntuple ntuple : anagramClassesSorted) {
- mapWordToAnagramClassSize.put((String)ntuple.getkth(0),(Integer)ntuple.getkth(2));
- }
- }
- // Problem 32. Create getter for anagramClassSize using mapWordToAnagramClassSize map.
- public int getAnagramClassSize(String word)
- {
- return mapWordToAnagramClassSize.get(word);
- }
- // Problem 33. Create getter for total number of AnagramClasses.
- public int getNumAnagramClasses()
- {
- Set<Set<String>> sets = new HashSet<Set<String>>();
- for (Set<String> set : mapWordToAnagramClass.values()) {
- sets.add(set);
- }
- return sets.size();
- }
- // Problem 34. Create getter for anagramClass max size.
- public int getAnagramClassMaxSize()
- {
- return (int)anagramClassesSorted.get(0).getkth(2);
- }
- // lexicon stats
- // Problem 35. Create array of alphabet character counts for a given string.
- public double[] charCountArray(String str)
- {
- double[] arr = new double[26];
- for (int i = 0; i < str.length(); i++) {
- arr[Character.getNumericValue(str.charAt(i))-10]++;
- }
- return arr;
- }
- // Problem 36. Create array of alphabet character count totals
- // for all strings in lexicon keySet.
- public double[] charCountArrayTotal()
- {
- double[] arr = new double[26];
- for (String str : plaintextWords) {
- for (int i = 0; i < str.length(); i++) {
- if (Character.isLetter(str.charAt(i))) {
- arr[Character.getNumericValue(str.charAt(i))-10]++;
- }
- }
- }
- return arr;
- }
- // Problem 37. Create array of alphabet character % totals
- // for all strings in lexicon keySet.
- public double[] charPercentArray()
- {
- double[] arr = new double[26];
- for (String str : plaintextWords) {
- for (int i = 0; i < str.length(); i++) {
- if (Character.isLetter(str.charAt(i))) {
- arr[Character.getNumericValue(str.charAt(i))-10]++;
- }
- }
- }
- double sum = 0;
- for (int i = 0; i < arr.length; i++) {
- sum += arr[i];
- }
- for (int i = 0; i < arr.length; i++) {
- arr[i] *= 100/sum;
- }
- return arr;
- }
- // Problem 38. Count (recursively) the number of occurrences of str1 in str2.
- public int countOccurr(String str1, String str2)
- {
- if (str1.length() > str2.length()) {
- return 0;
- }
- else if (str2.substring(0,str1.length()).equals(str1)) {
- return 1 + countOccurr(str1, str2.substring(1));
- }
- else return countOccurr(str1, str2.substring(1));
- }
- // Problem 39. Create ntuples of bigrams (pairs of characters)
- // sorted by percent.
- public List<Ntuple> bigramPercentSortedNtuples()
- {
- ArrayList<Ntuple> ntuples = new ArrayList<Ntuple>();
- List<String> bigrams = new ArrayList<String>();
- // insert your code here
- }
- // Problem 40. Create array of percentages of word lengths. The value at each
- // index i is the percentage of words of length i.
- public double[] wordLengthPercentArray(int maxLen)
- {
- double[] arr = new double[maxLen+1];
- for (String str : plaintextWords) {
- arr[str.length()]++;
- }
- double sum = 0;
- for (int i = 0; i < arr.length; i++) {
- sum += arr[i];
- }
- for (int i = 0; i < arr.length; i++) {
- arr[i] *= 100/sum;
- }
- return arr;
- }
- // Problem 41. Create lexicon report.
- public void lexiconReport(String wordInLexicon)
- {
- System.out.println("\nLexicon Data: Alphabet, Words, EquivClasses & AnigramClasses");
- System.out.println("getAlphabet() = " + getAlphabet());
- System.out.println("getAlphaSize() = " + getAlphaSize());
- System.out.println("getPlaintextWords() = " + getPlaintextWords().subList(0, 50));
- System.out.println("getPlaintextWords().size() = " + getPlaintextWords().size());
- System.out.println("\ncreateNtupleFromNtupleStr = " + createNtupleFromNtupleStr("Ntuple(post,[stoops, opts, post, stop, stoop, spot, spots, tops, pots, stops, posts],11)"));
- System.out.println("getEquivClassesSorted().subList(0, 50) = " + getEquivClassesSorted().subList(0, 50));
- System.out.println("getNumEquivClasses() = " + getNumEquivClasses());
- System.out.println("\nwordInLexicon = " + wordInLexicon);
- System.out.println("getEquivClass(wordInLexicon) = " + getEquivClass(wordInLexicon));
- System.out.println("getEquivClassSize(wordInLexicon) = " + getEquivClassSize(wordInLexicon));
- System.out.println("getEquivClassMaxSize = " + getEquivClassMaxSize());
- System.out.println("\ngetAnagramClassesSorted().subList(0, 50) = " + getAnagramClassesSorted().subList(0, 50));
- System.out.println("getNumAnagramClasses() = " + getNumAnagramClasses());
- System.out.println("\nwordInLexicon = " + wordInLexicon);
- System.out.println("getAnagramClass(wordInLexicon) = " + getAnagramClass(wordInLexicon));
- System.out.println("getAnagramClassSize(wordInLexicon) = " + getAnagramClassSize(wordInLexicon));
- System.out.println("getAnagramClassMaxSize = " + getAnagramClassMaxSize());
- System.out.println("\nwordInLexicon = " + wordInLexicon);
- System.out.println("getEquivClass(wordInLexicon) = " + getEquivClass(wordInLexicon));
- System.out.println("anagramClasses(getEquivClass(wordInLexicon)) = " + anagramClasses(getEquivClass(wordInLexicon)));
- System.out.println("getAnagramClass(wordInLexicon) = " + getAnagramClass(wordInLexicon));
- System.out.println("equivClasses(getAnagramClass(wordInLexicon)) = " + anagramClasses(getEquivClass(wordInLexicon)));
- // System.out.println("equivClasses(getAnagramClass(wordInLexicon)) = " + equivClasses(getAnagramClass(wordInLexicon)));
- System.out.println("\nLexicon Data: Character Counts & Percentages, Bigram Percentages and Word length Percentages");
- System.out.println("getAlphabet() = " + getAlphabet());
- System.out.println("wordInLexicon = " + wordInLexicon);
- System.out.println("charCountArray(wordInLexicon) = " + Arrays.toString(charCountArray(wordInLexicon)));
- System.out.println("charCountArrayTotal() = " + Arrays.toString(charCountArrayTotal()));
- System.out.println("charPercentArray() = " + Arrays.toString(charPercentArray()));
- // List<Ntuple> bigramPercentSortedNtuples = bigramPercentSortedNtuples();
- // System.out.println("\nbigramPercentSortedNtuples() = " + bigramPercentSortedNtuples);
- // System.out.println("bigramPercentSortedNtuples().size() = " + bigramPercentSortedNtuples.size() +" (with nonzero percent out of " + getAlphaSize()*getAlphaSize() + " bigrams)");
- System.out.println("wordLengthPercentArray(27) = " + Arrays.toString(wordLengthPercentArray(27)));
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement