Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.util.*;
- // author: j.lines@uea.ac.uk
/**
 * Helpers for generating random dictionaries and documents, turning documents into
 * word-count feature vectors, and finding each document's nearest neighbour.
 *
 * <p>All randomness comes from {@link #rand}; replace it with a seeded {@link Random}
 * for reproducible runs.
 */
public class CourseworkUtilities {

    /** Shared random source. Assign a seeded instance if you want reproducible results. */
    public static Random rand = new Random();

    /**
     * Example usage: builds a dictionary and two documents, prints their feature vectors
     * and the distance between them, then prints the nearest neighbour of each document
     * in a freshly generated collection.
     *
     * @param args unused
     * @throws Exception if the dictionary parameters are inconsistent
     */
    public static void main(String[] args) throws Exception {
        final int numDocsMultiArray = 5;
        final int numWordsInDoc = 10;
        final int numWordsDict = 5;
        final int wordLength = 2;

        String[] dict = generateDictionary(numWordsDict, wordLength);

        // Each document mixes dictionary words with random filler words.
        String[] doc = generateDocument(dict, numWordsInDoc);
        String[] doc1 = generateDocument(dict, numWordsInDoc);

        int[] featureVector = calculateFeatureVector(dict, doc);
        int[] featureVector1 = calculateFeatureVector(dict, doc1);

        String[][] twoDimenArrDoc = generateTwoDimenArrDoc(dict, numWordsInDoc, numDocsMultiArray);
        int dsd = documentSimilarityDistance(featureVector, featureVector1);
        int[] nearestDocuments = findNearestDocuments(twoDimenArrDoc, dict);

        System.out.print(Arrays.toString(featureVector) + Arrays.toString(featureVector1) + "DSD = " + dsd + "\n");
        for (int i = 0; i < nearestDocuments.length; i++) {
            System.out.println("Nearest documents to " + i + " is " + nearestDocuments[i]);
        }
    }

    /**
     * Generates a word of the given length from uniformly random lowercase letters.
     *
     * @param wordLength number of letters; zero or less yields the empty string
     * @return the generated word
     */
    public static String generateWord(int wordLength) {
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < wordLength; i++) {
            word.append((char) (rand.nextInt(26) + 'a'));
        }
        return word.toString();
    }

    /**
     * Generates a dictionary of distinct random words.
     *
     * <p>Words are drawn one at a time and duplicates are discarded, so the result holds
     * the accepted words in the order they were first drawn.
     *
     * <p>Coursework note: built-in Java collections are used here only because this
     * helper was supplied with the assignment. Do not use them in your own code.
     *
     * @param numWords number of distinct words required
     * @param wordLength length of every word
     * @return an array of {@code numWords} distinct words
     * @throws IllegalArgumentException if {@code numWords} is negative or exceeds the
     *     number of distinct words of length {@code wordLength}
     * @throws Exception retained in the signature for compatibility with existing callers
     */
    public static String[] generateDictionary(int numWords, int wordLength) throws Exception {
        if (numWords < 0) {
            throw new IllegalArgumentException("numWords must not be negative: " + numWords);
        }
        if (Math.pow(26, wordLength) < numWords) {
            throw new IllegalArgumentException("Error: the input arguments could only result in "
                    + "26^" + wordLength + " (" + ((int) (Math.pow(26, wordLength))) + ") distinct words but the"
                    + " numWords argument is set to " + numWords);
        }

        String[] dictionary = new String[numWords];
        // A set gives constant-time duplicate checks; the array keeps draw order.
        Set<String> seen = new HashSet<>();
        int filled = 0;
        while (filled < numWords) {
            String candidate = generateWord(wordLength);
            if (seen.add(candidate)) {
                dictionary[filled++] = candidate;
            }
        }
        return dictionary;
    }

    /**
     * Generates random words with no duplicate check. Intended purely as padding for
     * test documents.
     *
     * @param numWords number of words to generate
     * @param wordLength length of every word
     * @return an array of {@code numWords} random words, possibly containing repeats
     */
    public static String[] generateFillerWords(int numWords, int wordLength) {
        String[] fillers = new String[numWords];
        for (int i = 0; i < fillers.length; i++) {
            fillers[i] = generateWord(wordLength);
        }
        return fillers;
    }

    /**
     * Generates a document by sampling uniformly from a pool made of the dictionary plus
     * twice as many filler words, so each position has a one-in-three chance of being
     * drawn from the dictionary.
     *
     * <p>Filler words are random and are not checked against the dictionary, so a filler
     * may happen to equal a dictionary word.
     *
     * @param dictionary non-empty dictionary; its first word fixes the filler word length
     * @param numWordsInDoc number of words in the generated document
     * @return the generated document
     * @throws IllegalArgumentException if {@code dictionary} is empty
     */
    public static String[] generateDocument(String[] dictionary, int numWordsInDoc) {
        if (dictionary.length == 0) {
            throw new IllegalArgumentException("dictionary must contain at least one word");
        }

        String[] otherWords = generateFillerWords(dictionary.length * 2, dictionary[0].length());
        int poolSize = dictionary.length + otherWords.length;

        String[] document = new String[numWordsInDoc];
        for (int i = 0; i < numWordsInDoc; i++) {
            int pick = rand.nextInt(poolSize);
            document[i] = pick < dictionary.length
                    ? dictionary[pick]
                    : otherWords[pick - dictionary.length];
        }
        return document;
    }

    /**
     * Counts how many times each dictionary word occurs in a document.
     *
     * @param dict dictionary words
     * @param doc document words
     * @return an array the same length as {@code dict}; element {@code i} is the number
     *     of entries in {@code doc} equal to {@code dict[i]}
     */
    public static int[] calculateFeatureVector(String[] dict, String[] doc) {
        int[] counts = new int[dict.length];
        for (int d = 0; d < dict.length; d++) {
            for (String word : doc) {
                if (dict[d].equals(word)) {
                    counts[d]++;
                }
            }
        }
        return counts;
    }

    /**
     * Generates a collection of independent random documents.
     *
     * @param dict dictionary passed to {@link #generateDocument(String[], int)}
     * @param numWordsDoc number of words in each document
     * @param numDocs number of documents to generate
     * @return an array of {@code numDocs} documents
     */
    public static String[][] generateTwoDimenArrDoc(String[] dict, int numWordsDoc, int numDocs) {
        // Rows are assigned below, so only the outer dimension is allocated here.
        String[][] documents = new String[numDocs][];
        for (int i = 0; i < numDocs; i++) {
            documents[i] = generateDocument(dict, numWordsDoc);
        }
        return documents;
    }

    /**
     * For every document, finds the index of the other document whose feature vector is
     * closest according to {@link #documentSimilarityDistance(int[], int[])}.
     *
     * <p>Ties are resolved in favour of the lowest index. When there is no other document
     * to compare with (a single-document input), the entry is {@code -1}.
     *
     * @param twoDimenArrDoc the documents, one per row
     * @param dict dictionary used to build each document's feature vector
     * @return an array where element {@code i} is the index of the document nearest to
     *     document {@code i}, or {@code -1} if none exists
     */
    public static int[] findNearestDocuments(String[][] twoDimenArrDoc, String[] dict) {
        int docCount = twoDimenArrDoc.length;

        int[][] vectors = new int[docCount][];
        for (int i = 0; i < docCount; i++) {
            vectors[i] = calculateFeatureVector(dict, twoDimenArrDoc[i]);
        }

        int[] bestDistance = new int[docCount];
        int[] nearest = new int[docCount];
        Arrays.fill(bestDistance, Integer.MAX_VALUE);
        Arrays.fill(nearest, -1);

        // The distance is symmetric, so each pair is measured once and applied to both
        // documents. Every document still sees its candidates in ascending index order,
        // which, with the strict comparison, keeps the lowest index on ties.
        for (int i = 0; i < docCount; i++) {
            for (int j = i + 1; j < docCount; j++) {
                int distance = documentSimilarityDistance(vectors[i], vectors[j]);
                if (distance < bestDistance[i]) {
                    bestDistance[i] = distance;
                    nearest[i] = j;
                }
                if (distance < bestDistance[j]) {
                    bestDistance[j] = distance;
                    nearest[j] = i;
                }
            }
        }
        return nearest;
    }

    /**
     * Computes the document similarity distance between two feature vectors: the sum of
     * the absolute differences of corresponding elements.
     *
     * @param fVectorOne first feature vector
     * @param fVectorTwo second feature vector, the same length as the first
     * @return the summed absolute difference; {@code 0} for two empty vectors
     * @throws IllegalArgumentException if the vectors differ in length
     */
    public static int documentSimilarityDistance(int[] fVectorOne, int[] fVectorTwo) {
        if (fVectorOne.length != fVectorTwo.length) {
            throw new IllegalArgumentException("feature vectors must have the same length: "
                    + fVectorOne.length + " vs " + fVectorTwo.length);
        }
        int total = 0;
        for (int i = 0; i < fVectorOne.length; i++) {
            total += Math.abs(fVectorOne[i] - fVectorTwo[i]);
        }
        return total;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement