// Untitled

import java.util.*;

// author: j.lines@uea.ac.uk

/**
 * Helpers for generating random dictionaries and documents, turning documents into feature vectors
 * (per-dictionary-word occurrence counts) and comparing documents by the distance between those vectors.
 */
public class CourseworkUtilities {

    /** Shared random source used by every generator below; seed this if you want reproducible results. */
    public static Random rand = new Random();

    /**
     * Example usage: builds a dictionary, two single documents and a batch of documents, then prints the two
     * feature vectors with their similarity distance and the nearest neighbour of every document in the batch.
     *
     * @param args unused
     * @throws Exception if the dictionary parameters cannot produce enough distinct words
     */
    public static void main(String[] args) throws Exception {
        int numDocsMultiArray = 5;
        int numWordsInDoc = 10;
        int numWordsDict = 5;
        int wordLength = 2;

        // generate the dictionary
        String[] dict = generateDictionary(numWordsDict, wordLength);

        // Generate documents by passing in the dictionary and a total number of words. Each document is built
        // from a mix of dictionary words and filler words, sampled using the shared Random field 'rand'.
        String[] doc = generateDocument(dict, numWordsInDoc);
        String[] doc1 = generateDocument(dict, numWordsInDoc);

        int[] featureVector = calculateFeatureVector(dict, doc);
        int[] featureVector1 = calculateFeatureVector(dict, doc1);

        String[][] twoDimenArrDoc = generateTwoDimenArrDoc(dict, numWordsInDoc, numDocsMultiArray);

        int dsd = documentSimilarityDistance(featureVector, featureVector1);

        // one entry per document in the batch (not per dictionary word)
        int[] nearestDocuments = findNearestDocuments(twoDimenArrDoc, dict);

        System.out.print(Arrays.toString(featureVector) + Arrays.toString(featureVector1) + "DSD = " + dsd + "\n");

        for (int i = 0; i < nearestDocuments.length; i++) {
            System.out.println("Nearest documents to " + i + " is " + nearestDocuments[i]);
        }
    }

    /**
     * Generates a word of the given length by randomly picking lower-case letters 'a'..'z'.
     *
     * @param wordLength number of letters to generate; zero or less yields the empty string
     * @return the generated word
     */
    public static String generateWord(int wordLength) {
        StringBuilder st = new StringBuilder();
        for (int i = 0; i < wordLength; i++) {
            st.append((char) (rand.nextInt(26) + 'a'));
        }
        return st.toString();
    }

    /**
     * Generates a dictionary of distinct random words. Duplicates are not allowed, so every candidate is checked
     * against the words chosen so far before being added. As noted by the original author this is a crude
     * implementation and not the most efficient, which is fine for its purpose here.
     *
     * @param numWords   number of distinct words required
     * @param wordLength length of every word
     * @return an array of {@code numWords} distinct words, in the order they were generated
     * @throws IllegalArgumentException if fewer than {@code numWords} distinct words of that length exist
     * @throws Exception                declared for backward compatibility with existing callers
     */
    public static String[] generateDictionary(int numWords, int wordLength) throws Exception {

        if (Math.pow(26, wordLength) < numWords) {
            // The (int) cast cannot overflow here: this branch is only reached when 26^wordLength is below an int.
            throw new IllegalArgumentException("Error: the input arguments could only result in "
                    + "26^" + wordLength + " (" + ((int) (Math.pow(26, wordLength))) + ") distinct words but the"
                    + " numWords argument is set to " + numWords);
        }

        // remember - DO NOT USE IN-BUILT JAVA DATA STRUCTURES IN YOUR OWN CODE FOR THIS ASSIGNMENT (you can still
        // use arrays wherever you like, however)
        //
        // It is fine to use ArrayList here as this has been given to you but do not use it
        // anywhere else in your coursework.

        ArrayList<String> dictionary = new ArrayList<>(numWords);
        while (dictionary.size() < numWords) {
            String candidate = generateWord(wordLength);
            if (!dictionary.contains(candidate)) {
                dictionary.add(candidate);
            }
        }
        return dictionary.toArray(new String[dictionary.size()]);
    }

    /**
     * Generates a given number of random words of a specified length. Unlike the dictionary there is no
     * duplicate check: these words only pad out a document for testing/timing, so their values do not matter.
     *
     * @param numWords   number of words to generate
     * @param wordLength length of every word
     * @return an array of {@code numWords} random words
     */
    public static String[] generateFillerWords(int numWords, int wordLength) {

        String[] output = new String[numWords];
        for (int i = 0; i < numWords; i++) {
            output[i] = generateWord(wordLength);
        }
        return output;
    }

    /**
     * Generates a document from a dictionary. A pool of filler words twice the size of the dictionary is
     * created first; each document word is then drawn uniformly from the combined pool of dictionary and filler
     * words, so every pool entry is as likely to be picked as any other.
     *
     * @param dictionary    the dictionary to sample from; must contain at least one word
     * @param numWordsInDoc number of words in the generated document
     * @return the generated document, one word per element
     * @throws IllegalArgumentException if {@code dictionary} is empty
     */
    public static String[] generateDocument(String[] dictionary, int numWordsInDoc) {
        if (dictionary.length == 0) {
            // The filler word length is taken from dictionary[0], so an empty dictionary cannot be used.
            throw new IllegalArgumentException("Error: the dictionary must contain at least one word");
        }

        // generate other words to fill the document with
        String[] otherWords = generateFillerWords(dictionary.length * 2, dictionary[0].length());

        String[] documentList = new String[numWordsInDoc];

        // indices [0, dictionary.length) select a dictionary word, the remainder select a filler word
        int numDistinctWords = dictionary.length * 3;

        for (int i = 0; i < numWordsInDoc; i++) {
            int nextWordIdx = rand.nextInt(numDistinctWords);
            if (nextWordIdx < dictionary.length) {
                documentList[i] = dictionary[nextWordIdx];
            } else {
                documentList[i] = otherWords[nextWordIdx - dictionary.length];
            }
        }
        return documentList;
    }

    /**
     * Counts how often each dictionary word occurs in a document.
     *
     * @param dict the dictionary words
     * @param doc  the document words
     * @return an array the same length as {@code dict}; element {@code i} is the number of elements of
     *         {@code doc} equal to {@code dict[i]}
     */
    public static int[] calculateFeatureVector(String[] dict, String[] doc) {

        int[] fVector = new int[dict.length];

        for (int i = 0; i < dict.length; i++) {
            for (int j = 0; j < doc.length; j++) {
                if (dict[i].equals(doc[j])) {
                    fVector[i]++;
                }
            }
        }
        return fVector;
    }

    /**
     * Generates several documents from the same dictionary.
     *
     * @param dict        the dictionary to sample from; must contain at least one word when {@code numDocs > 0}
     * @param numWordsDoc number of words in each document
     * @param numDocs     number of documents to generate
     * @return an array of {@code numDocs} documents, each holding {@code numWordsDoc} words
     */
    public static String[][] generateTwoDimenArrDoc(String[] dict, int numWordsDoc, int numDocs) {

        // Rows are assigned below, so only the outer dimension needs allocating up front.
        String[][] twoDimenArrDoc = new String[numDocs][];

        for (int i = 0; i < numDocs; i++) {
            twoDimenArrDoc[i] = generateDocument(dict, numWordsDoc);
        }
        return twoDimenArrDoc;
    }

    /**
     * Finds, for every document, the index of the other document with the smallest similarity distance to it.
     * Ties are resolved in favour of the lowest index.
     *
     * @param twoDimenArrDoc the documents to compare, one document per row
     * @param dict           the dictionary used to build each document's feature vector
     * @return an array with one entry per document holding the index of its nearest other document, or
     *         {@code -1} when there is no other document to compare against
     */
    public static int[] findNearestDocuments(String[][] twoDimenArrDoc, String[] dict) {

        int numDocs = twoDimenArrDoc.length;

        int[] docDSD = new int[numDocs];         // smallest DSD found so far for each document
        int[] indexValues = new int[numDocs];    // index of the document that produced that smallest DSD

        Arrays.fill(docDSD, Integer.MAX_VALUE);  // so the first real distance always wins the comparison
        Arrays.fill(indexValues, -1);            // -1 = "no neighbour"; a document is never its own neighbour

        // compute every feature vector once rather than once per comparison
        int[][] twoDimenArrFeatureVectors = new int[numDocs][];
        for (int i = 0; i < numDocs; i++) {
            twoDimenArrFeatureVectors[i] = calculateFeatureVector(dict, twoDimenArrDoc[i]);
        }

        for (int i = 0; i < numDocs; i++) {
            for (int j = 0; j < numDocs; j++) {
                if (i == j) {
                    continue; // never compare a document with itself
                }
                int distance = documentSimilarityDistance(twoDimenArrFeatureVectors[i], twoDimenArrFeatureVectors[j]);
                if (distance < docDSD[i]) {
                    docDSD[i] = distance;
                    indexValues[i] = j;
                }
            }
        }
        return indexValues;
    }

    /**
     * Calculates the document similarity distance (DSD) between two feature vectors: the sum of the absolute
     * differences of their corresponding elements.
     *
     * @param fVectorOne first feature vector
     * @param fVectorTwo second feature vector; must be the same length as {@code fVectorOne}
     * @return the sum over all positions of {@code |fVectorOne[i] - fVectorTwo[i]|}
     * @throws IllegalArgumentException if the two vectors differ in length
     */
    public static int documentSimilarityDistance(int[] fVectorOne, int[] fVectorTwo) {
        if (fVectorOne.length != fVectorTwo.length) {
            // Vectors of different length were built from different dictionaries and cannot be compared.
            throw new IllegalArgumentException("Error: feature vectors must be the same length but were "
                    + fVectorOne.length + " and " + fVectorTwo.length);
        }

        int dsdSum = 0;
        for (int i = 0; i < fVectorOne.length; i++) {
            dsdSum += Math.abs(fVectorOne[i] - fVectorTwo[i]);
        }
        return dsdSum;
    }
}