Advertisement
Guest User

Untitled

a guest
Nov 20th, 2019
150
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.33 KB | None | 0 0
  1. import java.util.*;
  2.  
  3. // author: j.lines@uea.ac.uk
  4.  
  /**
   * Utilities for generating random dictionaries and documents, turning documents into
   * feature vectors (per-dictionary-word counts) and comparing documents by the distance
   * between their feature vectors.
   *
   * <p>All randomness comes from the shared {@link #rand} field; replace it with a seeded
   * {@code Random} to get reproducible results.
   */
  public class CourseworkUtilities {

      /** Shared random source; assign a seeded instance if you want reproducible results. */
      public static Random rand = new Random();

      /**
       * Example usage: builds a dictionary, two documents and a small collection of documents,
       * then prints the two feature vectors with their distance and the nearest neighbour of
       * every document in the collection.
       *
       * @param args unused
       * @throws Exception retained from the original signature; nothing checked is thrown
       */
      public static void main(String[] args) throws Exception {
          int numDocsMultiArray = 5;
          int numWordsInDoc = 10;
          int numWordsDict = 5;
          int wordLength = 2;

          // generate the dictionary
          String[] dict = generateDictionary(numWordsDict, wordLength);

          // Each document mixes dictionary words with randomly generated filler words,
          // all sampled through the shared rand field.
          String[] doc = generateDocument(dict, numWordsInDoc);
          String[] doc1 = generateDocument(dict, numWordsInDoc);

          int[] featureVector = calculateFeatureVector(dict, doc);
          int[] featureVector1 = calculateFeatureVector(dict, doc1);
          int dsd = documentSimilarityDistance(featureVector, featureVector1);

          String[][] twoDimenArrDoc = generateTwoDimenArrDoc(dict, numWordsInDoc, numDocsMultiArray);
          int[] nearest = findNearestDocuments(twoDimenArrDoc, dict);

          System.out.print(Arrays.toString(featureVector) + Arrays.toString(featureVector1) + "DSD = " + dsd + "\n");
          System.out.println("Nearest documents = " + Arrays.toString(nearest));
      }

      /**
       * Generates a word by appending {@code wordLength} random lower-case letters ('a'-'z').
       *
       * @param wordLength number of letters to generate; zero or less yields the empty string
       * @return the generated word
       */
      public static String generateWord(int wordLength) {
          StringBuilder st = new StringBuilder();
          for (int i = 0; i < wordLength; i++) {
              st.append((char) (rand.nextInt(26) + 'a'));
          }
          return st.toString();
      }

      /**
       * Generates a dictionary of distinct random words of a fixed length.
       *
       * <p>Duplicates are rejected with a linear {@code contains} scan, so this is a crude
       * implementation: it slows down as {@code numWords} approaches the number of distinct
       * words available (26^wordLength).
       *
       * @param numWords   number of distinct words required
       * @param wordLength length of every word
       * @return an array of {@code numWords} distinct words
       * @throws IllegalArgumentException if fewer than {@code numWords} distinct words of the
       *                                  given length exist
       * @throws Exception declared only so existing callers keep compiling unchanged
       */
      public static String[] generateDictionary(int numWords, int wordLength) throws Exception {

          if (Math.pow(26, wordLength) < numWords) {
              throw new IllegalArgumentException("Error: the input arguments could only result in "
                      + "26^" + wordLength + " (" + ((int) (Math.pow(26, wordLength))) + ") distinct words but the"
                      + " numWords argument is set to " + numWords);
          }

          // remember - DO NOT USE IN-BUILT JAVA DATA STRUCTURES IN YOUR OWN CODE FOR THIS ASSIGNMENT
          // (you can still use arrays wherever you like, however)
          //
          // It is fine to use ArrayList here as this has been given to you but do not use it
          // anywhere else in your coursework.
          ArrayList<String> dictionary = new ArrayList<>(numWords);
          while (dictionary.size() < numWords) {
              String candidate = generateWord(wordLength);
              if (!dictionary.contains(candidate)) {
                  dictionary.add(candidate);
              }
          }
          return dictionary.toArray(new String[dictionary.size()]);
      }

      /**
       * Generates {@code numWords} random words of the given length without checking for
       * duplicates. Used to pad documents with words for testing/timing.
       *
       * @param numWords   number of words to generate
       * @param wordLength length of every word
       * @return the generated words
       */
      public static String[] generateFillerWords(int numWords, int wordLength) {
          String[] output = new String[numWords];
          for (int i = 0; i < numWords; i++) {
              output[i] = generateWord(wordLength);
          }
          return output;
      }

      /**
       * Generates a document by sampling uniformly from a pool made of the dictionary words
       * plus twice as many randomly generated filler words, so roughly one word in three comes
       * from the dictionary. Filler words are random and unchecked, so one may happen to equal
       * a dictionary word.
       *
       * @param dictionary    non-empty dictionary; the length of its first word is used as the
       *                      length of the filler words
       * @param numWordsInDoc number of words in the generated document
       * @return the generated document
       * @throws IllegalArgumentException if the dictionary is null or empty, or if
       *                                  {@code numWordsInDoc} is negative
       */
      public static String[] generateDocument(String[] dictionary, int numWordsInDoc) {
          if (dictionary == null || dictionary.length == 0) {
              throw new IllegalArgumentException("dictionary must contain at least one word");
          }
          if (numWordsInDoc < 0) {
              throw new IllegalArgumentException("numWordsInDoc must not be negative: " + numWordsInDoc);
          }

          // generate other words to fill the document with
          String[] otherWords = generateFillerWords(dictionary.length * 2, dictionary[0].length());
          int numDistinctWords = dictionary.length * 3;

          String[] documentList = new String[numWordsInDoc];
          for (int i = 0; i < numWordsInDoc; i++) {
              int nextWordIdx = rand.nextInt(numDistinctWords);
              if (nextWordIdx < dictionary.length) {
                  documentList[i] = dictionary[nextWordIdx];
              } else {
                  // indices past the dictionary map onto the filler words
                  documentList[i] = otherWords[nextWordIdx - dictionary.length];
              }
          }
          return documentList;
      }

      /**
       * Counts how many times each dictionary word occurs in a document.
       *
       * @param dict dictionary words
       * @param doc  document words
       * @return an array the same length as {@code dict} whose element {@code i} is the number
       *         of entries of {@code doc} equal to {@code dict[i]}
       */
      public static int[] calculateFeatureVector(String[] dict, String[] doc) {
          int[] fVector = new int[dict.length];
          for (int i = 0; i < dict.length; i++) {
              for (int j = 0; j < doc.length; j++) {
                  if (dict[i].equals(doc[j])) {
                      fVector[i]++;
                  }
              }
          }
          return fVector;
      }

      /**
       * Generates a collection of documents, one per row.
       *
       * @param dict        dictionary passed to {@link #generateDocument}
       * @param numWordsDoc number of words in every document
       * @param numDocs     number of documents to generate
       * @return an array of {@code numDocs} documents
       */
      public static String[][] generateTwoDimenArrDoc(String[] dict, int numWordsDoc, int numDocs) {
          String[][] twoDimenArrDoc = new String[numDocs][];
          for (int i = 0; i < numDocs; i++) {
              twoDimenArrDoc[i] = generateDocument(dict, numWordsDoc);
          }
          return twoDimenArrDoc;
      }

      /**
       * Finds, for every document, the other document whose feature vector is closest under
       * {@link #documentSimilarityDistance}.
       *
       * @param twoDimenArrDoc documents, one per row
       * @param dict           dictionary used to build the feature vectors
       * @return an array with one entry per document holding the index of its nearest other
       *         document; ties go to the lowest index, and the entry is -1 when there is no
       *         other document to compare against
       */
      public static int[] findNearestDocuments(String[][] twoDimenArrDoc, String[] dict) {
          int numDocs = twoDimenArrDoc.length;

          // Build every feature vector once instead of once per compared pair.
          int[][] featureVectors = new int[numDocs][];
          for (int i = 0; i < numDocs; i++) {
              featureVectors[i] = calculateFeatureVector(dict, twoDimenArrDoc[i]);
          }

          int[] nearest = new int[numDocs];
          for (int i = 0; i < numDocs; i++) {
              int bestIndex = -1;
              int bestDistance = 0;
              for (int j = 0; j < numDocs; j++) {
                  if (j == i) {
                      continue; // a document is never its own neighbour
                  }
                  int distance = documentSimilarityDistance(featureVectors[i], featureVectors[j]);
                  // strict comparison keeps the earliest document on ties
                  if (bestIndex == -1 || distance < bestDistance) {
                      bestIndex = j;
                      bestDistance = distance;
                  }
              }
              nearest[i] = bestIndex;
          }
          return nearest;
      }

      /**
       * Computes the distance between two feature vectors as the sum of the absolute
       * differences of their corresponding elements.
       *
       * @param fVectorOne first feature vector
       * @param fVectorTwo second feature vector, same length as the first
       * @return the summed absolute element-wise difference
       * @throws IllegalArgumentException if the vectors differ in length
       */
      public static int documentSimilarityDistance(int[] fVectorOne, int[] fVectorTwo) {
          if (fVectorOne.length != fVectorTwo.length) {
              throw new IllegalArgumentException("feature vectors differ in length: "
                      + fVectorOne.length + " vs " + fVectorTwo.length);
          }
          int dsdSum = 0;
          for (int i = 0; i < fVectorOne.length; i++) {
              dsdSum += Math.abs(fVectorOne[i] - fVectorTwo[i]);
          }
          return dsdSum;
      }

      /**
       * Linear search for a value in an array.
       *
       * @param arr array to search; may be null
       * @param t   value to look for
       * @return the index of the first element equal to {@code t}, or -1 if {@code arr} is
       *         null or does not contain {@code t}
       */
      public static int findIndex(int[] arr, int t) {
          if (arr == null) {
              return -1;
          }
          for (int i = 0; i < arr.length; i++) {
              if (arr[i] == t) {
                  return i;
              }
          }
          return -1;
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement