Advertisement
Guest User

Untitled

a guest
Apr 28th, 2015
240
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 6.38 KB | None | 0 0
  1. package student_classes;
  2.  
  3. import java.io.File;
  4. import java.io.FileNotFoundException;
  5. import java.util.ArrayList;
  6. import java.util.HashMap;
  7. import java.util.Map;
  8. import java.util.Scanner;
  9. import java.util.Set;
  10. import java.util.regex.Pattern;
  11.  
  12. // imports should go here ... depending upon your approach ...
  13. // I strongly advise java.util.Scanner, java.util.regex, and most likely
  14. // a dictionary class, such as Hashtable or HashMap.
  15.  
  16. /**
  17.  * @author UMD CS Department:
  18.  * A <code>Concordence</code> is an object that embodies the association of tokens (words taken from a text)
  19.  * to their "context." This class defines "context"  as the the number of
  20.  * times that a particular token (word) occurs in a document (which is a text file).
  21.  * <P>
  22.  * Clients of this class provide a complete pathname to a text file (<file>.txt) and
  23.  * a boolean variable <code>is_case_sensitive</code> that determines whether tokens are stored in
  24.  * the original case or are converted to lower case during the construction of the
  25.  * associations as well as during the retrieval of counts associated with particular
  26.  * tokens. </P>
  27.  * <P>Clients may
  28.  * use the various methods on the <code>Concordence</code> object to retrieve information such as how
  29.  * many times a particular word occurred or lists of words that occurred a number of times.
  30.  * </P>
  31.  * <P>
  32.  * <strong>
  33.  * Special characters, such as syntax marks, are ignored. It is possible, therefore, that hyphenated
  34.  * words or contractions will register incorrectly: For example: "can't" might be "can" "t." Certainly,
  35.  * developers of this class are encouraged to explore Java's regular expressions package to
  36.  * improve this implementation!</strong>
  37.  * </P>
  38.  *
  39.  */
  40. public class Concordence {
  41. // Properties:
  42.     private HashMap<String, Integer> wordMap;
  43.     private boolean is_case_sensitive;
  44. //  Constructor(s):
  45.     /**
  46.      * Default ctor: sets up internal tables ... not usefully called by
  47.      * anyone outside of this class.
  48.      */
  49.     protected Concordence( ) {
  50.         wordMap = new HashMap<String, Integer>();
  51.     }
  52.     /**
  53.      * Main Constructor: requires two parameters:
  54.      * <P>
  55.      * (1) <code>pathName</code> is a <code>String</code> representing a valid pathname, i.e., a pathname
  56.      * whose last component is the name of a "text file." Text files comprise normal characters and are
  57.      * assumed to have the suffix "<filename>.txt".
  58.      * <P>
  59.      * (2) <code> is_case_sensitive</code>, which is a <code>boolean</code>
  60.      * that determines if the capitalization of tokens matters. If the client specifies that
  61.      * <code>is_case_sensitive</code> is <code>True</code>, then the original case of all tokens will be preserved during the
  62.      * construction of the tables (associations) as well as during the retrieval of data that involves
  63.      * the comparison of tokens by the various public methods exposed by the Concordence object.</item>
  64.      * </P>
  65.      * @param pathName (String)
  66.      * @param is_case_sensitive (boolean)
  67.      */
  68.     public Concordence( String pathName, boolean is_case_sensitive ) {
  69.         this();
  70.         this.is_case_sensitive = is_case_sensitive;
  71.         try {
  72.             Scanner doc = new Scanner(new File(pathName));
  73.            
  74.             Pattern pattern = Pattern.compile("\\W");
  75.             while (doc.hasNext()){
  76.                 String word;
  77.                
  78.                 if (is_case_sensitive)
  79.                     word = join(pattern.split(doc.next()));
  80.                 else
  81.                     word = join(pattern.split(doc.next().toLowerCase()));
  82.                
  83.                 if (wordMap.containsKey(word))
  84.                     wordMap.put(word, wordMap.get(word)+1);
  85.                 else
  86.                     wordMap.put(word, 1);
  87.            
  88.             }
  89.            
  90.         } catch (FileNotFoundException e) {
  91.             // TODO Auto-generated catch block
  92.             e.printStackTrace();
  93.         }
  94.        
  95.     }
  96. //  Public Methods:
  97.     /**<P>
  98.      * Given a (String) token, return how many times it occurred in the text.</P>
  99.      * <P>Preconditions: The constructor has been successfully called.</P>
  100.      * <P>Postconditions: a counting number is returned.</P>
  101.      * <P> Note: this method is sensitive to
  102.      * the value of the <code>is_case_sensitive</code> parameter that the user specified during the
  103.      * construction of this instance. If the user specified that case was to be ignored,
  104.      * then all tokens have been installed and will be compared in lower case; otherwise,
  105.      * the original case of the token(s) as they were found in the document will be used.
  106.      */
  107.     public int getTokenCount( String for_token ) {
  108.         Integer value;
  109.         if (this.is_case_sensitive)
  110.             value = wordMap.get(for_token);
  111.         else
  112.             value =  wordMap.get(for_token.toLowerCase());
  113.         return value == null ? 0: value;
  114.  
  115.     }
  116.     /**
  117.      * <P>Preconditions: The Constructor has successfully been called. Note: the <code>by_count</code>
  118.      * argument must be an integer greater than 0 or an Illegal Argument exception is thrown.</P>
  119.      * <P>
  120.      * Postcondition: An <code>Iterable(String)</code> object is returned that contains an unordered
  121.      * list of tokens (which are unique) whose counts equal <code>by_count</code> Note: this list
  122.      * could be empty, but should not be under ordinary circumstances.</P>
  123.      * @param by_count <code>(int > 0)</code>
  124.      * @return <code>Iterable(String)</code> An Iterable<String> object (which may be empty) that contains the tokens whose counts
  125.      * equal the <code>by_count</code> (int) parameter.
  126.      */
  127.     public Iterable<String> getTokensByCount( int by_count ) {
  128.         if (by_count > 0){
  129.             ArrayList<String> tokens = new ArrayList<String>();
  130.             Set<Map.Entry<String, Integer>> entries = wordMap.entrySet();
  131.             for (Map.Entry<String, Integer> ent: entries){
  132.                 if (ent.getValue() == by_count){
  133.                     if (!tokens.contains(ent.getKey()))
  134.                         tokens.add(ent.getKey());
  135.                 }
  136.             }    
  137.             return tokens;
  138.         }else {
  139.             throw new IllegalArgumentException("Valid count not given");
  140.         }
  141.     }
  142.     /**
  143.      * <P>Preconditions: The Constructor for this class has been successfully called.
  144.      * </P>
  145.      * <P>Postconditions: The current size, which is the number of entries, in the Concordence table is returned.
  146.      * @return<code> int>= 0</code>
  147.      */
  148.     public final int size() {
  149.         return wordMap.size();
  150.     }
  151.    
  152. //  Overrides ...
  153.     /**
  154.      * Returns a String that identifies this object and provides a little detail ...
  155.      */
  156.     public String toString() {
  157.         String str = "Total words: " + size() + "\n";
  158.         str += (this.is_case_sensitive)? "Case Sensitive" : "Case Insensitive";
  159.         return str;
  160.     }
  161.    
  162.     private String join(String[] parts){
  163.         String newString = "";
  164.         for (int i = 0; i < parts.length; i++)
  165.             newString += parts[i];
  166.         return newString;
  167.     }
  168. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement