Advertisement
Guest User

Untitled

a guest
Nov 20th, 2019
118
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.48 KB | None | 0 0
  1. import java.util.*;
  2.  
  3. // author: j.lines@uea.ac.uk
  4.  
  /**
   * Helpers for generating random dictionaries and documents, turning documents
   * into feature vectors (per-dictionary-word occurrence counts) and comparing
   * those vectors with a document similarity distance (DSD).
   *
   * <p>All randomness comes from the shared {@link #rand} field.
   */
  public class CourseworkUtilities {

      /** Shared random source; seed this if you want reproducible results. */
      public static Random rand = new Random();

      /**
       * Example usage: builds a dictionary, two documents and a small collection
       * of documents, then prints the two feature vectors, their DSD and the
       * nearest neighbour of every document in the collection.
       *
       * @param args unused
       * @throws Exception if the dictionary parameters are inconsistent
       */
      public static void main(String[] args) throws Exception {
          int numDocsMultiArray = 5;
          int numWordsInDoc = 10;
          int numWordsDict = 5;
          int wordLength = 2;

          String[] dict = generateDictionary(numWordsDict, wordLength);

          // Each document mixes dictionary words with random filler words.
          String[] doc = generateDocument(dict, numWordsInDoc);
          String[] doc1 = generateDocument(dict, numWordsInDoc);

          int[] featureVector = calculateFeatureVector(dict, doc);
          int[] featureVector1 = calculateFeatureVector(dict, doc1);

          String[][] twoDimenArrDoc = generateTwoDimenArrDoc(dict, numWordsInDoc, numDocsMultiArray);

          int dsd = documentSimilarityDistance(featureVector, featureVector1);
          int[] nearestDocuments = findNearestDocuments(twoDimenArrDoc, dict);

          System.out.print(Arrays.toString(featureVector) + Arrays.toString(featureVector1) + "DSD = " + dsd + "\n");

          for (int i = 0; i < nearestDocuments.length; i++) {
              System.out.println("Nearest documents to " + i + " is " + nearestDocuments[i]);
          }
      }

      /**
       * Generates a word by drawing each letter uniformly from 'a'..'z'.
       *
       * @param wordLength number of letters; values below 1 yield an empty string
       * @return the generated lower-case word
       */
      public static String generateWord(int wordLength) {
          StringBuilder word = new StringBuilder();
          for (int i = 0; i < wordLength; i++) {
              word.append((char) (rand.nextInt(26) + 'a'));
          }
          return word.toString();
      }

      /**
       * Generates a dictionary of distinct random words, in the order they were
       * first drawn.
       *
       * <p>Note from the original scaffolding: coursework code must not use
       * built-in Java data structures (arrays are fine); the collection used here
       * is part of the supplied generator only.
       *
       * @param numWords   number of distinct words required
       * @param wordLength length of every word
       * @return an array of {@code numWords} distinct words
       * @throws IllegalArgumentException if {@code numWords} is negative or
       *         exceeds the {@code 26^wordLength} words that can exist
       * @throws Exception never thrown directly; kept in the signature so
       *         existing callers compile unchanged
       */
      public static String[] generateDictionary(int numWords, int wordLength) throws Exception {
          if (numWords < 0) {
              throw new IllegalArgumentException("numWords must not be negative: " + numWords);
          }
          if (Math.pow(26, wordLength) < numWords) {
              throw new IllegalArgumentException("Error: the input arguments could only result in "
                      + "26^" + wordLength + " (" + ((int) (Math.pow(26, wordLength))) + ") distinct words but the"
                      + " numWords argument is set to " + numWords);
          }

          // A LinkedHashSet rejects duplicates in constant time and keeps
          // first-insertion order, so the result matches a list-with-contains
          // approach without the quadratic scan.
          Set<String> dictionary = new LinkedHashSet<>();
          while (dictionary.size() < numWords) {
              dictionary.add(generateWord(wordLength));
          }
          return dictionary.toArray(new String[0]);
      }

      /**
       * Generates random words with no duplicate checking; used to pad documents
       * with words for testing/timing.
       *
       * @param numWords   number of words to generate
       * @param wordLength length of every word
       * @return an array of {@code numWords} random words
       */
      public static String[] generateFillerWords(int numWords, int wordLength) {
          String[] output = new String[numWords];
          for (int i = 0; i < numWords; i++) {
              output[i] = generateWord(wordLength);
          }
          return output;
      }

      /**
       * Generates a document by sampling uniformly from a pool made of the
       * dictionary words plus twice as many random filler words. Filler words
       * have the same length as the first dictionary word and are not checked
       * against the dictionary, so a filler word may coincide with a dictionary
       * word.
       *
       * @param dictionary    non-empty dictionary to sample from
       * @param numWordsInDoc number of words in the generated document
       * @return the generated document, one word per element
       * @throws IllegalArgumentException if {@code dictionary} is null or empty
       */
      public static String[] generateDocument(String[] dictionary, int numWordsInDoc) {
          if (dictionary == null || dictionary.length == 0) {
              throw new IllegalArgumentException("dictionary must contain at least one word");
          }

          String[] otherWords = generateFillerWords(dictionary.length * 2, dictionary[0].length());
          String[] documentList = new String[numWordsInDoc];
          int numDistinctWords = dictionary.length * 3;

          for (int i = 0; i < numWordsInDoc; i++) {
              int nextWordIdx = rand.nextInt(numDistinctWords);
              // Indices below dictionary.length select a dictionary word, the rest a filler word.
              documentList[i] = nextWordIdx < dictionary.length
                      ? dictionary[nextWordIdx]
                      : otherWords[nextWordIdx - dictionary.length];
          }
          return documentList;
      }

      /**
       * Counts how often each dictionary word occurs in a document.
       *
       * @param dict dictionary words
       * @param doc  document words
       * @return an array where element {@code i} is the number of occurrences of
       *         {@code dict[i]} in {@code doc}
       */
      public static int[] calculateFeatureVector(String[] dict, String[] doc) {
          int[] fVector = new int[dict.length];
          for (int i = 0; i < dict.length; i++) {
              for (String word : doc) {
                  if (dict[i].equals(word)) {
                      fVector[i]++;
                  }
              }
          }
          return fVector;
      }

      /**
       * Generates a collection of independent random documents.
       *
       * @param dict        dictionary passed to {@link #generateDocument}
       * @param numWordsDoc number of words in each document
       * @param numDocs     number of documents to generate
       * @return an array of {@code numDocs} documents
       */
      public static String[][] generateTwoDimenArrDoc(String[] dict, int numWordsDoc, int numDocs) {
          // Rows are assigned below, so only the outer dimension is allocated here.
          String[][] twoDimenArrDoc = new String[numDocs][];
          for (int i = 0; i < numDocs; i++) {
              twoDimenArrDoc[i] = generateDocument(dict, numWordsDoc);
          }
          return twoDimenArrDoc;
      }

      /**
       * Finds, for every document, the index of the other document with the
       * smallest DSD. When several documents tie, the lowest index wins.
       *
       * @param twoDimenArrDoc documents to compare, one word array per document
       * @param dict           dictionary used to build the feature vectors
       * @return an array where element {@code i} is the index of the document
       *         nearest to document {@code i}, or {@code -1} when there is no
       *         other document to compare against
       */
      public static int[] findNearestDocuments(String[][] twoDimenArrDoc, String[] dict) {
          int docCount = twoDimenArrDoc.length;

          // Compute every feature vector once up front rather than per comparison.
          int[][] featureVectors = new int[docCount][];
          for (int i = 0; i < docCount; i++) {
              featureVectors[i] = calculateFeatureVector(dict, twoDimenArrDoc[i]);
          }

          int[] nearest = new int[docCount];
          // -1 marks "no neighbour", so a lone document is not reported as nearest to itself.
          Arrays.fill(nearest, -1);

          for (int i = 0; i < docCount; i++) {
              int smallest = Integer.MAX_VALUE;
              for (int j = 0; j < docCount; j++) {
                  if (i == j) {
                      continue; // never compare a document with itself
                  }
                  int distance = documentSimilarityDistance(featureVectors[i], featureVectors[j]);
                  if (nearest[i] == -1 || distance < smallest) {
                      smallest = distance;
                      nearest[i] = j;
                  }
              }
          }
          return nearest;
      }

      /**
       * Computes the document similarity distance between two feature vectors:
       * the sum of the absolute differences of corresponding elements.
       *
       * @param fVectorOne first feature vector
       * @param fVectorTwo second feature vector, same length as the first
       * @return the sum of absolute element-wise differences
       * @throws IllegalArgumentException if the vectors differ in length
       */
      public static int documentSimilarityDistance(int[] fVectorOne, int[] fVectorTwo) {
          if (fVectorOne.length != fVectorTwo.length) {
              throw new IllegalArgumentException("feature vectors must have the same length but were "
                      + fVectorOne.length + " and " + fVectorTwo.length);
          }
          int dsdSum = 0;
          for (int i = 0; i < fVectorOne.length; i++) {
              dsdSum += Math.abs(fVectorOne[i] - fVectorTwo[i]);
          }
          return dsdSum;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement