Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.util.*;
- // author: j.lines@uea.ac.uk
/**
 * Helpers for a document-similarity exercise: random dictionary and document
 * generation, feature-vector calculation, and nearest-document search.
 *
 * <p>A <em>feature vector</em> for a document holds, for each dictionary word,
 * the number of times that word occurs in the document. The <em>document
 * similarity distance</em> between two documents is the sum of absolute
 * differences between their feature vectors.
 *
 * <p>All random choices are drawn from {@link #rand}.
 */
public class CourseworkUtilities {

    /** Shared random source; seed it (or replace it) if you want reproducible results. */
    public static Random rand = new Random();

    /**
     * Example usage: builds a dictionary and some documents, then prints two
     * feature vectors with their similarity distance, followed by the
     * nearest-document result for a small collection.
     *
     * @param args unused
     * @throws Exception if the dictionary parameters are inconsistent
     */
    public static void main(String[] args) throws Exception {
        int numDocsMultiArray = 5;
        int numWordsInDoc = 10;
        int numWordsDict = 5;
        int wordLength = 2;

        // generate the dictionary
        String[] dict = generateDictionary(numWordsDict, wordLength);

        // Each document is a mix of dictionary words and unseen filler words,
        // sampled with the shared Random object.
        String[] doc = generateDocument(dict, numWordsInDoc);
        String[] doc1 = generateDocument(dict, numWordsInDoc);

        int[] featureVector = calculateFeatureVector(dict, doc);
        int[] featureVector1 = calculateFeatureVector(dict, doc1);
        int dsd = documentSimilarityDistance(featureVector, featureVector1);

        String[][] twoDimenArrDoc = generateTwoDimenArrDoc(dict, numWordsInDoc, numDocsMultiArray);
        int[] nearest = findNearestDocuments(twoDimenArrDoc, dict);

        System.out.print(Arrays.toString(featureVector) + Arrays.toString(featureVector1) + "DSD = " + dsd + "\n");
        System.out.println("Nearest documents = " + Arrays.toString(nearest));
    }

    /**
     * Generates a word by drawing random lowercase letters ('a' to 'z').
     *
     * @param wordLength number of letters; values below 1 give the empty string
     * @return the generated word
     */
    public static String generateWord(int wordLength) {
        StringBuilder st = new StringBuilder();
        for (int i = 0; i < wordLength; i++) {
            st.append((char) (rand.nextInt(26) + 'a'));
        }
        return st.toString();
    }

    /**
     * Generates a dictionary of distinct random words of a fixed length.
     *
     * <p>Duplicates are rejected with a linear scan of the words generated so
     * far. This is a deliberately crude implementation; it is fine for small
     * dictionaries but not efficient for large ones.
     *
     * <p>Remember: DO NOT USE IN-BUILT JAVA DATA STRUCTURES IN YOUR OWN CODE FOR
     * THIS ASSIGNMENT (arrays are fine). The ArrayList here is acceptable only
     * because this method was provided.
     *
     * @param numWords   number of distinct words required
     * @param wordLength length of every word
     * @return an array of {@code numWords} distinct words, in generation order
     * @throws IllegalArgumentException if fewer than {@code numWords} distinct
     *         words of length {@code wordLength} exist
     * @throws Exception declared for compatibility with existing callers
     */
    public static String[] generateDictionary(int numWords, int wordLength) throws Exception {
        double distinctWords = Math.pow(26, wordLength);
        if (distinctWords < numWords) {
            // distinctWords < numWords <= Integer.MAX_VALUE, so the int cast cannot overflow
            throw new IllegalArgumentException("Error: the input arguments could only result in "
                    + "26^" + wordLength + " (" + ((int) distinctWords) + ") distinct words but the"
                    + " numWords argument is set to " + numWords);
        }

        List<String> dictionary = new ArrayList<>(numWords);
        while (dictionary.size() < numWords) {
            String candidate = generateWord(wordLength);
            if (!dictionary.contains(candidate)) {
                dictionary.add(candidate);
            }
        }
        return dictionary.toArray(new String[0]);
    }

    /**
     * Generates random words without checking for duplicates. Used only to pad
     * documents with words for testing and timing; their values do not matter.
     *
     * @param numWords   number of words to generate
     * @param wordLength length of every word
     * @return an array of {@code numWords} random words
     */
    public static String[] generateFillerWords(int numWords, int wordLength) {
        String[] output = new String[numWords];
        for (int i = 0; i < numWords; i++) {
            output[i] = generateWord(wordLength);
        }
        return output;
    }

    /**
     * Generates a document from a dictionary plus twice as many filler words.
     *
     * <p>Each position is sampled uniformly from the combined pool of
     * {@code 3 * dictionary.length} entries, so every pool entry is as likely
     * to be picked as any other. Filler words have the same length as the first
     * dictionary word and may coincide with dictionary words by chance.
     *
     * @param dictionary    the dictionary words; must contain at least one word
     * @param numWordsInDoc number of words in the generated document
     * @return the generated document
     * @throws IllegalArgumentException if {@code dictionary} is empty
     */
    public static String[] generateDocument(String[] dictionary, int numWordsInDoc) {
        if (dictionary.length == 0) {
            throw new IllegalArgumentException("dictionary must contain at least one word");
        }

        // generate other words to fill the document with
        String[] otherWords = generateFillerWords(dictionary.length * 2, dictionary[0].length());
        String[] documentList = new String[numWordsInDoc];
        int numDistinctWords = dictionary.length * 3;

        for (int i = 0; i < numWordsInDoc; i++) {
            int nextWordIdx = rand.nextInt(numDistinctWords);
            if (nextWordIdx < dictionary.length) {
                documentList[i] = dictionary[nextWordIdx];
            } else {
                documentList[i] = otherWords[nextWordIdx - dictionary.length];
            }
        }
        return documentList;
    }

    /**
     * Counts how many times each dictionary word occurs in a document.
     *
     * @param dict the dictionary words
     * @param doc  the document words
     * @return an array parallel to {@code dict}; element {@code i} is the number
     *         of elements of {@code doc} equal to {@code dict[i]}
     */
    public static int[] calculateFeatureVector(String[] dict, String[] doc) {
        int[] fVector = new int[dict.length];
        for (int i = 0; i < dict.length; i++) {
            for (int j = 0; j < doc.length; j++) {
                if (dict[i].equals(doc[j])) {
                    fVector[i]++;
                }
            }
        }
        return fVector;
    }

    /**
     * Generates a collection of random documents from one dictionary.
     *
     * @param dict        the dictionary words; must contain at least one word
     *                    when {@code numDocs > 0}
     * @param numWordsDoc number of words in each document
     * @param numDocs     number of documents to generate
     * @return an array of {@code numDocs} documents, each of {@code numWordsDoc} words
     */
    public static String[][] generateTwoDimenArrDoc(String[] dict, int numWordsDoc, int numDocs) {
        String[][] twoDimenArrDoc = new String[numDocs][];
        for (int i = 0; i < numDocs; i++) {
            twoDimenArrDoc[i] = generateDocument(dict, numWordsDoc);
        }
        return twoDimenArrDoc;
    }

    /**
     * Finds, for every document, the other document whose feature vector is
     * closest by {@link #documentSimilarityDistance(int[], int[])}.
     *
     * <p>A document is never matched with itself. When several documents are
     * equally close, the one with the lowest index wins.
     *
     * @param twoDimenArrDoc the documents, one array of words per document
     * @param dict           the dictionary used to build the feature vectors
     * @return an array parallel to {@code twoDimenArrDoc}; element {@code i} is
     *         the index of the document nearest to document {@code i}, or
     *         {@code -1} when there is no other document
     */
    public static int[] findNearestDocuments(String[][] twoDimenArrDoc, String[] dict) {
        int numDocs = twoDimenArrDoc.length;

        // Build each feature vector once rather than once per compared pair.
        int[][] featureVectors = new int[numDocs][];
        for (int i = 0; i < numDocs; i++) {
            featureVectors[i] = calculateFeatureVector(dict, twoDimenArrDoc[i]);
        }

        int[] nearest = new int[numDocs];
        for (int i = 0; i < numDocs; i++) {
            int bestIndex = -1;
            int bestDistance = Integer.MAX_VALUE;
            for (int j = 0; j < numDocs; j++) {
                if (j == i) {
                    continue; // a document is not its own neighbour
                }
                int distance = documentSimilarityDistance(featureVectors[i], featureVectors[j]);
                // strict '<' keeps the lowest index on ties
                if (bestIndex == -1 || distance < bestDistance) {
                    bestIndex = j;
                    bestDistance = distance;
                }
            }
            nearest[i] = bestIndex;
        }
        return nearest;
    }

    /**
     * Computes the document similarity distance between two feature vectors:
     * the sum over all positions of the absolute difference of the elements.
     *
     * @param fVectorOne the first feature vector
     * @param fVectorTwo the second feature vector; must have the same length
     * @return the sum of absolute element-wise differences
     * @throws IllegalArgumentException if the vectors differ in length
     */
    public static int documentSimilarityDistance(int[] fVectorOne, int[] fVectorTwo) {
        if (fVectorOne.length != fVectorTwo.length) {
            throw new IllegalArgumentException("feature vectors must have the same length but were "
                    + fVectorOne.length + " and " + fVectorTwo.length);
        }
        int dsdSum = 0;
        for (int i = 0; i < fVectorOne.length; i++) {
            dsdSum += Math.abs(fVectorOne[i] - fVectorTwo[i]);
        }
        return dsdSum;
    }

    /**
     * Finds the first position of a value in an array.
     *
     * @param arr the array to search; may be {@code null}
     * @param t   the value to look for
     * @return the lowest index {@code i} with {@code arr[i] == t}, or {@code -1}
     *         if {@code arr} is {@code null} or does not contain {@code t}
     */
    public static int findIndex(int[] arr, int t) {
        if (arr == null) {
            return -1;
        }
        for (int i = 0; i < arr.length; i++) {
            if (arr[i] == t) {
                return i;
            }
        }
        return -1;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement