Advertisement
Not a member of Pastebin yet? Sign Up — it unlocks many cool features!
- public static void main(String[] args) throws Exception {
- CommandLine cmdLine;
- try {
- cmdLine = processArgs(args);
- }
- catch (Exception e) {
- return;
- }
- int n = Integer.parseInt(cmdLine.getOptionValue("n"));
- String fName = cmdLine.getOptionValue("f");
- String outFileName = cmdLine.getOptionValue("o");
- boolean allNGrams = false;
- long keepMax = 0;
- if(cmdLine.hasOption("max")) {
- keepMax = Long.parseLong(cmdLine.getOptionValue("m"));
- }
- System.out.println("Counting items...");
- BufferedReader inputStream = new BufferedReader(new InputStreamReader(new FileInputStream(fName),"UTF8"));
- long numItems = 0;
- while(true) {
- String line1 = inputStream.readLine();
- if(line1==null) {break;}
- String line2 = inputStream.readLine();
- if(line2==null) {break;}
- numItems++;
- }
- System.out.println(numItems + " items counted");
- inputStream.close();
- long totalNum = 0;
- long numSentences = 0;
- Map<String, Long> ngramHash = new HashMap();
- inputStream = new BufferedReader(new InputStreamReader(new FileInputStream(fName),"UTF8"));
- ProgressTimer myTimer = new ProgressTimer(numItems);
- double numSentenceTokens = 0;
- Map<Long, Set<String>> freqMap = new TreeMap();
- while (true) {
- // The file consists of line-couples:
- // First line contains a frequency number
- String freqString = inputStream.readLine();
- if(freqString==null) {break;}
- long sentenceFreq=0;
- try { sentenceFreq = Long.parseLong(freqString); }
- catch (Exception exp) { break; }
- // And second line contains a sentence
- String sentenceString = inputStream.readLine();
- if(sentenceString==null) {break;}
- // This means that the sentence just read occurs sentenceFreq times
- StringTokenizer tokenizer = new StringTokenizer(sentenceString);
- // q is used for storing the n-gram
- Queue<String> q = new LinkedList<String>();
- while (tokenizer.hasMoreTokens()) {
- String token = tokenizer.nextToken();
- numSentenceTokens++;
- q.add(token);
- if (q.size() >= n) {
- String nGram = q.poll();
- for (int i=0; i<n-1; i++) {
- String gram = q.poll();
- nGram = nGram + " " + gram;
- q.add(gram);
- }
- Long prevVal = ngramHash.get(nGram);
- if (prevVal==null) { prevVal = new Long(0); }
- Long newVal = prevVal + sentenceFreq;
- ngramHash.put(nGram, newVal);
- // If we want to keep only the keepMax most frequent entries:
- if (keepMax > 0) {
- // here we maintain a frequency map which maps frequencies
- // to sets of strings. Whenever the main ngramHash map is updated
- // the freqMap is also updated.
- if (prevVal != 0L) {
- Set prevSet = freqMap.get(prevVal);
- prevSet.remove(nGram);
- if (prevSet.isEmpty()) {freqMap.remove(prevVal); prevSet = null;}
- }
- Set newSet = freqMap.get(newVal);
- if (newSet == null) {
- newSet = new LinkedHashSet();
- freqMap.put(newVal, newSet);
- }
- newSet.add(nGram);
- // If our main hashmap exceeds the keepMax size, we remove one of its lowest-
- // frequent entries - of course, we also update freqMap
- if (ngramHash.size() > keepMax) {
- Iterator <Set <String>> it = freqMap.values().iterator();
- if (it.hasNext()) {
- Set <String>lowestFreqStrings = it.next();
- Iterator <String> it2 = lowestFreqStrings.iterator();
- String strToRemove = it2.next();
- it2.remove();
- ngramHash.remove(strToRemove);
- if (lowestFreqStrings.isEmpty()) {
- it2 = null;
- it.remove();
- }
- }
- }
- }
- }
- token = null;
- }
- q = null;
- tokenizer = null;
- sentenceString = null;
- freqString = null;
- totalNum += sentenceFreq;
- numSentences++;
- if (myTimer.printProgress(numSentences, "MapSize: " + ngramHash.size() + " FreqMapSize:" + freqMap.size())) {
- Iterator <Set <String>>it3 = freqMap.values().iterator();
- int sumItems = 0;
- while(it3.hasNext()) {
- Set <String> currSet = it3.next();
- sumItems += currSet.size();
- }
- float avg = new Float(sumItems) / new Float (freqMap.size());
- System.out.println("Avg freqmap size:"+avg + " Total strings in freqmap:" + sumItems);
- }
- }
- inputStream.close();
- System.out.println(numSentences + " unique sentences counted");
- System.out.println(totalNum + " sentence instances counted");
- System.out.println((numSentenceTokens / (new Double(numSentences))) + " tokens / sentence on average");
- Iterator it = ngramHash.entrySet().iterator();
- List<Tuple> ngramList = new ArrayList();
- System.out.println("Converting hashMap to ArrayList for sorting...");
- while (it.hasNext()) {
- Map.Entry pairs = (Map.Entry)it.next();
- ngramList.add(new NGramCounter.Tuple(pairs.getKey(), (Long)pairs.getValue()));
- it.remove();
- }
- System.out.println("Sorting...");
- Collections.sort(ngramList, Collections.reverseOrder());
- System.out.println("Saving to " + outFileName );
- Writer out = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(outFileName), "UTF-8"));
- for (Tuple<String> t: ngramList) {
- out.write(String.valueOf(t.second) + "\n");
- out.write(t.first + "\n");
- }
- out.close();
- System.out.println("Done");
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement