Guest User

Untitled

a guest
Aug 21st, 2018
139
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 6.58 KB | None | 0 0
  1. /**
  2.  * This program reads a file containing documents
  3.  * (one per line), builds a keyword index and searches
  4.  * the index for given search term.
  5.  * Usage: java SearchIndex DocumentFile SearchTerm
  6.  * @author: ingo.bax at fh-muenster.de
  7.  * @version: SoSe 2011
  8.  */
  9.  
  10. import java.io.*;
  11. import java.util.*;
  12.  
  13. public class SearchIndex {
  14.  
  15.     private static final String DELIMITERS = " \t\n\f!@#$%^&*()_+{}[]:\";'<>,.?/";
  16.  
  17.     public static void main(String[] args) {
  18.  
  19.         // check arguments
  20.         if (args.length<2) {
  21.             System.err.println("Usage: java SearchIndex DocumentFile SearchTerm");
  22.             return;
  23.         }
  24.         String documentsFileName = args[0];
  25.         String searchTerm = args[1];
  26.  
  27.         // for benchmarks
  28.         long time;
  29.  
  30.         // read documents from file
  31.         System.out.print("Reading documents -> ");
  32.         time = System.currentTimeMillis();
  33.         ArrayList<String> documents = readDocuments(documentsFileName);
  34.         System.out.println("done. " + documents.size() + " documents read in " + (System.currentTimeMillis() - time) + " ms.");
  35.  
  36.         // build index from documents
  37.         System.out.print("Building index -> ");
  38.         time = System.currentTimeMillis();
  39.         ArrayList<IndexEntry> index = buildIndex(documents);
  40.         System.out.println("done. " + index.size() + " index entries created in " + (System.currentTimeMillis() - time) + " ms.");
  41.  
  42.         /*
  43.          * // find search term using binary search
  44.          */
  45.         /*
  46.         System.out.print("Searching index with binary search -> ");
  47.         time = System.currentTimeMillis();
  48.         int documentID = lookupBinary(index, searchTerm);
  49.         System.out.println("done in " + (System.currentTimeMillis() - time) + " ms.");
  50.  
  51.         // print result
  52.         if (documentID == -1) {
  53.             System.out.println("No Document found.");
  54.         }
  55.         else {
  56.             System.out.println("Match found in document " + documentID + ":");
  57.             System.out.println(documents.get(documentID));
  58.         }
  59.         */
  60.        
  61.         System.out.print("Searching index with binary search -> ");
  62.         time = System.currentTimeMillis();
  63.         List<Integer> documentIDList = lookupBinaryWithList(index, searchTerm);
  64.        
  65.         if(documentIDList.size() > 0) {
  66.             for(Integer documentID : documentIDList)
  67.             {
  68.                 System.out.println("Match found in document " + documentID + ":");
  69.                 System.out.println(documents.get(documentID));
  70.             }
  71.         }
  72.         else {
  73.             System.out.println("No Document found.");
  74.         }
  75.         System.out.println("done in " + (System.currentTimeMillis() - time) + " ms.");
  76.     }
  77.  
  78.     /**
  79.      * Reads all documents from a specified file
  80.      * @param filename String containing the documents one per line
  81.      * @return ArrayList containing all documents as Strings
  82.      */
  83.     private static ArrayList<String> readDocuments(String filename) {
  84.         ArrayList<String> documents = new ArrayList<String>();
  85.         try {
  86.             BufferedReader br = new BufferedReader(new FileReader(filename));
  87.             while (true) {
  88.                 String line = br.readLine();
  89.                 if (line == null) break;
  90.                 documents.add(line);
  91.             }
  92.             br.close();
  93.         }
  94.         catch (IOException e) {
  95.             throw new RuntimeException("Could not read documents from " + filename + ": " + e);
  96.         }
  97.         return documents;
  98.     }
  99.  
  100.     /**
  101.      * Reads all entries from the dictionary file
  102.      * @param filename String containing the name of the word list file
  103.      * @return ArrayList containing all dictionary entries as Strings
  104.      */
  105.     private static ArrayList<IndexEntry> buildIndex(ArrayList<String> documents) {
  106.         // initialize index
  107.         ArrayList<IndexEntry> index = new ArrayList<IndexEntry>();
  108.  
  109.         // create index elements from documents
  110.         for (int documentID=0; documentID<documents.size(); documentID++) {
  111.             StringTokenizer t = new StringTokenizer(documents.get(documentID), DELIMITERS);
  112.             while (t.hasMoreTokens()) {
  113.                 index.add(new IndexEntry(t.nextToken(),documentID));
  114.             }
  115.         }
  116.        
  117.         // sort index
  118.         Collections.sort(index);
  119.        
  120.         return index;
  121.     }
  122.  
  123.     /**
  124.      * Seaches a specified index for a specified key
  125.      * @param index ArrayList of IndexEntries
  126.      * @param searchTerm String search term that should be found
  127.      * @return documentID of the first document found that contains the specified search term.
  128.      * -1 if no document was found.
  129.      */
  130.     @SuppressWarnings("unused")
  131.     private static int lookupBinary(ArrayList<IndexEntry> index, String searchTerm) {
  132.         int i = Collections.binarySearch(index,new IndexEntry(searchTerm,0));
  133.        
  134.         if (i==-1) {
  135.             return -1;
  136.         }
  137.         else {
  138.             return index.get(i).documentID;
  139.         }
  140.     }
  141.    
  142.     /**
  143.      * Searches a specified index for a specified key
  144.      * @param index ArrayList of IndexEntries
  145.      * @param searchTerm String search term that should be found
  146.      * @return documentID Array of the documents found that contains the specified search term.
  147.      * empty if no document was found.
  148.      */
  149.     @SuppressWarnings("unchecked")
  150.     private static ArrayList<Integer> lookupBinaryWithList(ArrayList<IndexEntry> index, String searchTerm) {
  151.         ArrayList<IndexEntry> indexSearched = new ArrayList<IndexEntry>();
  152.         ArrayList<Integer> indexMatchList = new ArrayList<Integer>();  
  153.        
  154.         // Copy index
  155.         indexSearched = (ArrayList<IndexEntry>) index.clone();
  156.        
  157.         int i = 1;
  158.         while(i > 0)
  159.         {
  160.             i = Collections.binarySearch(indexSearched,new IndexEntry(searchTerm,0));
  161.        
  162.             if (i > 0) {
  163.                 int documentID = index.get(i).documentID;
  164.                
  165.                 // Check whether the documentId already exist
  166.                 if(!indexMatchList.contains(documentID)) {
  167.                     indexMatchList.add(index.get(i).documentID);
  168.                 }
  169.                
  170.                 // Remove indexEntry from temporary index
  171.                 indexSearched.remove(i);           
  172.             }
  173.         }
  174.        
  175.         return indexMatchList;
  176.     }
  177.    
  178.     /**
  179.      * Representation of an entry in the search index containing an
  180.      * uppercase key and a documentID.
  181.      * (This class implements the interface Comparable<IndexEntry>, so that
  182.      * an ArrayList<IndexEntry> can be easily sorted using Collections.sort
  183.      * and subsequently searched using Collections.binarySearch)
  184.      */
  185.     private static class IndexEntry implements Comparable<IndexEntry> {
  186.         public String key;
  187.         public int documentID;
  188.  
  189.         /**
  190.          * Creates an IndexEntry object
  191.          * @param key String containing the key
  192.          * @param documentID the id of the document the key was found in
  193.          */
  194.         public IndexEntry(String key, int documentID) {
  195.             this.key = key.toUpperCase();
  196.             this.documentID = documentID;
  197.         }
  198.  
  199.         /**
  200.          * Compares the key of this IndexEntry to the key of a specified IndexEntry
  201.          * @param e an IndexEntry
  202.          * @return 0 in case the keys are equal,
  203.          *         -1 if the key of this IndexEntry appears before the key of specifid IndexEntry
  204.          *         1 if the key of this IndexEntry appears after the key of specifid IndexEntry
  205.          */
  206.         public int compareTo(IndexEntry e) {
  207.             return this.key.compareTo(e.key);
  208.         }      
  209.     }
  210. }
Add Comment
Please, Sign In to add comment