UriSteiff

fixed (without static)

May 12th, 2021
509
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. package il.ac.tau.cs.sw1.ex5;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.FileReader;
  5. import java.io.FileWriter;
  6. import java.io.IOException;
  7. import java.io.PrintWriter;
  8.  
  9. public class BigramModel {
  10.     public static final int MAX_VOCABULARY_SIZE = 14500;
  11.     public static final String VOC_FILE_SUFFIX = ".voc";
  12.     public static final String COUNTS_FILE_SUFFIX = ".counts";
  13.     public static final String SOME_NUM = "some_num";
  14.     public static final int ELEMENT_NOT_FOUND = -1;
  15.    
  16.     String[] mVocabulary;
  17.     int[][] mBigramCounts;
  18.    
  19.     // DO NOT CHANGE THIS !!!
  20.     public void initModel(String fileName) throws IOException{
  21.         mVocabulary = buildVocabularyIndex(fileName);
  22.         mBigramCounts = buildCountsArray(fileName, mVocabulary);
  23.        
  24.     }
  25.    
  26. //  public static boolean isLegalWord(String word) {
  27. //      char[] chars = word.toCharArray(); // convert word to array of characters
  28. //      int digitsCounter = 0; // counts digits in word
  29. //      for (char ch:chars) {
  30. //          if (Character.isLetter(ch)) {
  31. //              return true;
  32. //          }
  33. //          else if (Character.isDigit(ch)) {
  34. //              digitsCounter += 1;
  35. //          }
  36. //      }
  37. //      return (digitsCounter == chars.length); // only digits
  38. //     
  39. //  }
  40.    
  41.     public static boolean wordWithLetter(String word) {
  42.         char[] chars = word.toCharArray();
  43.         for (char ch: chars) {
  44.             if (Character.isLetter(ch)) {
  45.                 return true;
  46.             }
  47.         }
  48.         return false;
  49.     }
  50.    
  51.     public static boolean isNumber(String word) {
  52.         char[] chars = word.toCharArray();
  53.         for (char ch: chars) {
  54.             if (!Character.isDigit(ch)) {
  55.                 return false;
  56.             }
  57.         }
  58.         return true;
  59.     }
  60.    
  61.     public static boolean isLegalWord(String word) {
  62.         return (wordWithLetter(word)||isNumber(word));
  63.     }
  64.    
  65.     public static int getIndex(String[] vocabulary, String word) {
  66.         int i = 0;
  67.         while (!vocabulary[i].equals(word) && i < vocabulary.length - 1) {
  68.             i += 1;
  69.         }
  70.        
  71.         if (vocabulary[i].equals(word)) {
  72.             return i;
  73.         }
  74.         else {
  75.             return -1;
  76.         }
  77.     }
  78.    
  79.     public static void deleteDuplicates(String word, String[] arr) {
  80.         int count = 0;
  81.         for (int i = 0; i < arr.length; i++) {
  82.             if (arr[i].equals(word)) {
  83.                 count += 1;
  84.                 if (count > 1) {
  85.                     arr[i] = "#"; // denote deletion
  86.                 }
  87.             }
  88.         }
  89.     }
  90.    
  91.    
  92.     /*
  93.      * @post: mVocabulary = prev(mVocabulary)
  94.      * @post: mBigramCounts = prev(mBigramCounts)
  95.      */
  96.     public String[] buildVocabularyIndex(String fileName) throws IOException{ // Q 1
  97.        
  98.         FileReader fr = new FileReader(fileName);
  99.         BufferedReader br = new BufferedReader(fr);
  100.         String text = "";
  101.         String new_line;
  102.         while ((new_line = br.readLine()) != null) {
  103.             String[] words = new_line.split(" ");
  104.             for (String word: words) {
  105.                 if (wordWithLetter(word)) {
  106.                     text += word.toLowerCase() + " ";
  107.                 }
  108.                 if (isNumber(word)) {
  109.                     text += SOME_NUM + " ";
  110.                 }
  111.             }
  112.         }
  113.        
  114.         String[] legals = text.split(" ");
  115.        
  116.         int countUniqueLegalWords = 0;
  117.        
  118.         for (String word: legals) {
  119.             if (!word.equals("#")) {
  120.                 countUniqueLegalWords += 1;
  121.                 deleteDuplicates(word, legals);
  122.             }
  123.         }
  124.        
  125.         int length = MAX_VOCABULARY_SIZE;
  126.         if (countUniqueLegalWords < length) {
  127.             length = countUniqueLegalWords;
  128.         }
  129.        
  130.         String[] vocabulary = new String[length];
  131.        
  132.         int index_to_add = 0;
  133.         int i = 0;
  134.         while (i < legals.length && index_to_add < vocabulary.length) {
  135.             if (!legals[i].equals("#")) {
  136.                 vocabulary[index_to_add] = legals[i];
  137.                 index_to_add += 1;
  138.             }
  139.             i += 1;
  140.         }
  141.         return vocabulary;
  142.        
  143.         }  
  144.  
  145.    
  146.    
  147.    
  148.     /*
  149.      * @post: mVocabulary = prev(mVocabulary)
  150.      * @post: mBigramCounts = prev(mBigramCounts)
  151.      */
  152.     public int[][] buildCountsArray(String fileName, String[] vocabulary) throws IOException{ // Q - 2
  153.        
  154.         int[][] counts = new int[vocabulary.length][vocabulary.length];
  155.         FileReader fr = new FileReader(fileName);
  156.         BufferedReader br = new BufferedReader(fr);
  157.         String line;
  158.         while ((line = br.readLine()) != null) {
  159.             String[] sentence = line.split(" "); // words in line
  160.             for (int k = 0; k < sentence.length - 1; k++) {
  161.                 if (wordWithLetter(sentence[k]) && wordWithLetter(sentence[k+1])) {
  162.                     int i = getIndex(vocabulary, sentence[k].toLowerCase());
  163.                     int j = getIndex(vocabulary, sentence[k+1].toLowerCase());
  164.                     counts[i][j] += 1;
  165.                 }
  166.                 else if (isNumber(sentence[k]) && isNumber(sentence[k+1])) {
  167.                     int i = getIndex(vocabulary, SOME_NUM);
  168.                     int j = getIndex(vocabulary, SOME_NUM);
  169.                     counts[i][j] += 1;
  170.                 }
  171.             }  
  172.         }
  173.         return counts;
  174.  
  175.     }
  176.    
  177.    
  178.     /*
  179.      * @pre: the method initModel was called (the language model is initialized)
  180.      * @pre: fileName is a legal file path
  181.      */
  182.     public void saveModel(String fileName) throws IOException{ // Q-3
  183.         String vocabFile = fileName + VOC_FILE_SUFFIX;
  184.         String countsFile = fileName + COUNTS_FILE_SUFFIX;
  185.         PrintWriter writer1 = null;
  186.         PrintWriter writer2 = null;
  187.         FileWriter vfw = new FileWriter(vocabFile, true);
  188.         FileWriter cfw = new FileWriter(countsFile, true);
  189.         writer1 = new PrintWriter(vfw);
  190.         writer2 = new PrintWriter(cfw);
  191.         writer1.println(mVocabulary.length + " words");
  192.         for (int i = 0; i < mVocabulary.length; i++) {
  193.             writer1.println(i + "," + mVocabulary[i]);
  194.         }
  195.         writer1.close();
  196.        
  197.         for (int i = 0; i < mBigramCounts.length; i++) {
  198.             for (int j = 0; j < mBigramCounts[0].length; j++) {
  199.                 if (mBigramCounts[i][j] > 0) {
  200.                     writer2.println(i + "," + j + ":" + mBigramCounts[i][j]);
  201.                 }
  202.             }
  203.         }
  204.         writer2.close();
  205.        
  206.     }
  207.    
  208.    
  209.    
  210.     /*
  211.      * @pre: fileName is a legal file path
  212.      */
  213.     public void loadModel(String fileName) throws IOException{ // Q - 4
  214.        
  215.         // vocabulary
  216.         FileReader vocFr = new FileReader(fileName + VOC_FILE_SUFFIX);
  217.         BufferedReader vocBr = new BufferedReader(vocFr);
  218.         String firstLine = vocBr.readLine();
  219.         String[] firstLineArr = firstLine.split(" " );
  220.         int numberOfWords = Integer.parseInt(firstLineArr[0]);
  221.        
  222.         String[] vocabulary = new String[numberOfWords];
  223.        
  224.         String newLine;
  225.        
  226.         while ((newLine = vocBr.readLine()) != null) {
  227.             String[] newLineArr = newLine.split(",");
  228.             int i = Integer.parseInt(newLineArr[0]);
  229.             String word = newLineArr[1];
  230.             vocabulary[i] = word;
  231.         }
  232.        
  233.         // bigram counts
  234.         int[][] counts = new int[vocabulary.length][vocabulary.length];
  235.         FileReader countsFr = new FileReader(fileName + COUNTS_FILE_SUFFIX);
  236.         BufferedReader countsBr = new BufferedReader(countsFr);
  237.         String line;
  238.         while ((line = countsBr.readLine()) != null) {
  239.             String[] lineParts = line.split(":");
  240.             String part1 = lineParts[0];
  241.             int value = Integer.parseInt(lineParts[1]);
  242.             String[] indices = part1.split(",");
  243.             int i_index = Integer.parseInt(indices[0]);
  244.             int j_index = Integer.parseInt(indices[1]);
  245.             counts[i_index][j_index] = value;
  246.         }
  247.        
  248.         mVocabulary = vocabulary;
  249.         mBigramCounts = counts;
  250.     }
  251.  
  252.    
  253.    
  254.     /*
  255.      * @pre: word is in lowercase
  256.      * @pre: the method initModel was called (the language model is initialized)
  257.      * @pre: word is in lowercase
  258.      * @post: $ret = -1 if word is not in vocabulary, otherwise $ret = the index of word in vocabulary
  259.      */
  260.     public int getWordIndex(String word){  // Q - 5
  261.         return getIndex(mVocabulary, word);
  262.     }
  263.    
  264.    
  265.    
  266.     /*
  267.      * @pre: word1, word2 are in lowercase
  268.      * @pre: the method initModel was called (the language model is initialized)
  269.      * @post: $ret = the count for the bigram <word1, word2>. if one of the words does not
  270.      * exist in the vocabulary, $ret = 0
  271.      */
  272.     public int getBigramCount(String word1, String word2){ //  Q - 6
  273.         int findResult1 = getWordIndex(word1);
  274.         int findResult2 = getWordIndex(word2);
  275.         if (findResult1 == -1 || findResult2 == -1) {
  276.             return 0;
  277.         }
  278.         else { // both words in vocabulary
  279.             return mBigramCounts[findResult1][findResult2];
  280.         }
  281.     }
  282.    
  283.    
  284.     /*
  285.      * @pre word in lowercase, and is in mVocabulary
  286.      * @pre: the method initModel was called (the language model is initialized)
  287.      * @post $ret = the word with the lowest vocabulary index that appears most fequently after word (if a bigram starting with
  288.      * word was never seen, $ret will be null
  289.      */
  290.     public String getMostFrequentProceeding(String word){ //  Q - 7
  291.         int i = getWordIndex(word); // index in vocabulary
  292.         int maxTimesProc = 0;
  293.         String mostFrequent = "";
  294.         for (int j = 0; j < mBigramCounts[i].length; j++) {
  295.             if (mBigramCounts[i][j] > maxTimesProc) {
  296.                 mostFrequent = mVocabulary[j]; // word
  297.                 maxTimesProc = mBigramCounts[i][j]; // number of times
  298.             }
  299.         }
  300.         if (maxTimesProc == 0) {
  301.             return null;
  302.         }
  303.         else {
  304.             return mostFrequent;
  305.         }
  306.     }
  307.    
  308.    
  309.     /* @pre: sentence is in lowercase
  310.      * @pre: the method initModel was called (the language model is initialized)
  311.      * @pre: each two words in the sentence are are separated with a single space
  312.      * @post: if sentence is is probable, according to the model, $ret = true, else, $ret = false
  313.      */
  314.     public boolean isLegalSentence(String sentence){  //  Q - 8
  315.         String[] sentenceWords = sentence.split(" "); // words in sentence
  316.         if (sentenceWords.length == 0) { // empty sentence
  317.             return true;
  318.         }
  319.         else if (sentenceWords.length == 1) { // one word sentence
  320.             String word = sentenceWords[0];
  321.             if (getWordIndex(word) == -1) {
  322.                 return false;
  323.             }
  324.             else {
  325.                 return true;
  326.             }
  327.         }
  328.         else { // more than one word
  329.             for (int i = 0; i < sentenceWords.length - 1; i++) {
  330.                 String word1 = sentenceWords[i]; // first word in pair
  331.                 String word2 = sentenceWords[i+1]; // second word in pair
  332.                 int word1_index = getWordIndex(word1);
  333.                 int word2_index = getWordIndex(word2);
  334.                 if (word1_index == -1 || word2_index == -1) {
  335.                     return false;
  336.                 }
  337.                 else { // both words in vocabulary
  338.                     if (mBigramCounts[word1_index][word2_index]  == 0) {
  339.                         return false;
  340.                     }
  341.                 }
  342.             }
  343.             return true;
  344.         }
  345.     }
  346.    
  347.     public static boolean onlyZeros(int[] arr) {
  348.         for (int num:arr) {
  349.             if (num != 0) {
  350.                 return false;
  351.             }
  352.         }
  353.         return true;
  354.     }
  355.    
  356.     /*
  357.      * @pre: arr1.length = arr2.legnth
  358.      * post if arr1 or arr2 are only filled with zeros, $ret = -1, otherwise calcluates CosineSim
  359.      */
  360.     public static double calcCosineSim(int[] arr1, int[] arr2){ //  Q - 9
  361.         if (onlyZeros(arr1) || onlyZeros(arr2)) {
  362.             return -1.;
  363.         }
  364.         else {
  365.             // numerator
  366.             double numerator = 0;
  367.             for (int i = 0; i < arr1.length; i++) {
  368.                 numerator += arr1[i] * arr2[i];
  369.             }
  370.            
  371.             // denominator
  372.             double sumSquares1 = 0;
  373.             for (int number:arr1) {
  374.                 sumSquares1 += number * number;
  375.             }
  376.             double sumSquares2 = 0;
  377.             for (int number:arr2) {
  378.                 sumSquares2 += number * number;
  379.             }
  380.             double denominator = Math.sqrt(sumSquares1) * Math.sqrt(sumSquares2);
  381.             return numerator / denominator;
  382.         }
  383.     }
  384.  
  385.    
  386.     /*
  387.      * @pre: word is in vocabulary
  388.      * @pre: the method initModel was called (the language model is initialized),
  389.      * @post: $ret = w implies that w is the word with the largest cosineSimilarity(vector for word, vector for w) among all the
  390.      * other words in vocabulary
  391.      */
  392.     public String getClosestWord(String word){ //  Q - 10
  393.        
  394.         int i = getWordIndex(word); // index of word in vocabulary
  395.         double factor;
  396.         String closestWord;
  397.        
  398.         if (mVocabulary.length == 1) {
  399.             return mVocabulary[0];
  400.         }
  401.         else if (i == 0) { // word is first in vocabulary
  402.             closestWord = mVocabulary[1];
  403.             factor = calcCosineSim(mBigramCounts[i], mBigramCounts[1]);
  404.         }
  405.         else {
  406.             closestWord = mVocabulary[0];
  407.             factor = calcCosineSim(mBigramCounts[i], mBigramCounts[0]);
  408.         }
  409.        
  410.         int[] wordVector = mBigramCounts[i]; // vector of word
  411.         for (int j = 0; j < mBigramCounts.length; j++) {
  412.             if (j != i) {
  413.                 String word2 = mVocabulary[j];
  414.                 int[] word2Vector = mBigramCounts[j]; // vector of second word
  415.                 double cos = calcCosineSim(wordVector, word2Vector);
  416.                 if (cos > factor) {
  417.                     closestWord = word2;
  418.                     factor = cos;
  419.                 }
  420.             }
  421.         }
  422.         return closestWord;
  423.     }
  424.    
  425.  
  426.        
  427.        
  428.     }
  429.  
  430.  
RAW Paste Data