Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.File;
- import java.lang.Integer;
- import java.lang.System;
- import java.lang.reflect.Array;
- import java.security.MessageDigest;
- import java.util.Comparator;
- import java.security.NoSuchAlgorithmException;
- import java.util.*;
- import java.io.BufferedReader;
- import java.io.FileReader;
- import java.util.InputMismatchException;
/**
 * Picks a pseudo-random sample of lines from a text file, using a generator
 * seeded from a user name, and reports the most frequent words in that sample.
 *
 * <p>Words are obtained by splitting each sampled line on {@link #delimiters},
 * lower-casing the tokens and discarding everything in {@link #stopWordsSet}.
 */
public class MP1 {
    /** Number of entries in the array returned by {@link #process()}. */
    private static final int TOP_WORD_COUNT = 20;
    /** Number of line indexes drawn by {@link #getIndexes()} (repeats allowed). */
    private static final int SAMPLE_SIZE = 10000;
    /** Exclusive upper bound for a drawn line index. */
    private static final int NUMBER_OF_LINES = 50000;

    Random generator;
    String userName;
    String inputFileName;
    static final String delimiters = " \t,;.?!-:@[](){}_*/";
    static final String[] stopWordsArray = {"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
            "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
            "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
            "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having",
            "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"};
    static final Set<String> stopWordsSet = new HashSet<String>(Arrays.asList(stopWordsArray));

    /**
     * Creates {@link #generator} with a seed derived from the SHA digest of the
     * trimmed, lower-cased {@code seed} string.
     *
     * @param seed text the generator seed is derived from
     * @throws NoSuchAlgorithmException if the "SHA" digest is not available
     */
    void initialRandomGenerator(String seed) throws NoSuchAlgorithmException {
        MessageDigest messageDigest = MessageDigest.getInstance("SHA");
        // Locale.ROOT keeps the seed identical regardless of the machine's default locale.
        // NOTE(review): getBytes() still uses the platform charset; left unchanged so
        // existing seeds do not shift.
        messageDigest.update(seed.toLowerCase(Locale.ROOT).trim().getBytes());
        byte[] seedDigest = messageDigest.digest();
        long longSeed = 0;
        for (int i = 0; i < seedDigest.length; i++) {
            // A long shift only uses the low six bits of the distance, so bytes past the
            // eighth wrap around and are added onto the lower bytes. This is kept exactly
            // as-is: changing it would change every generated sample.
            longSeed += ((long) seedDigest[i] & 0xffL) << (8 * i);
        }
        this.generator = new Random(longSeed);
    }

    /**
     * Re-seeds the generator from {@link #userName} and draws the line indexes to sample.
     *
     * @return {@value #SAMPLE_SIZE} indexes, each in {@code [0, NUMBER_OF_LINES)}, in draw
     *         order; the same index may occur more than once
     * @throws NoSuchAlgorithmException if the "SHA" digest is not available
     */
    Integer[] getIndexes() throws NoSuchAlgorithmException {
        Integer[] ret = new Integer[SAMPLE_SIZE];
        this.initialRandomGenerator(this.userName);
        for (int i = 0; i < SAMPLE_SIZE; i++) {
            ret[i] = generator.nextInt(NUMBER_OF_LINES);
        }
        return ret;
    }

    /**
     * @param userName      text used to seed the line sampling
     * @param inputFileName path of the text file to sample lines from
     */
    public MP1(String userName, String inputFileName) {
        this.userName = userName;
        this.inputFileName = inputFileName;
    }

    /**
     * Counts the words of the sampled lines and returns the most frequent ones.
     *
     * <p>A line whose index was drawn several times is counted once per draw.
     *
     * @return an array of length 20 holding the words ordered by descending count, ties
     *         broken by ascending {@link String#compareTo}; trailing slots are
     *         {@code null} when fewer than 20 distinct words were seen
     * @throws IndexOutOfBoundsException if a drawn index is not a valid line of the file
     * @throws Exception                 if the file cannot be read or the digest is unavailable
     */
    public String[] process() throws Exception {
        Integer[] indices = this.getIndexes();
        List<String> lines = readLines(inputFileName);

        Map<String, Integer> counter = new HashMap<String, Integer>();
        for (Integer lineIndex : indices) {
            for (String word : splitWords(lines.get(lineIndex))) {
                Integer seen = counter.get(word);
                counter.put(word, seen == null ? 1 : seen + 1);
            }
        }

        List<Map.Entry<String, Integer>> ranked = rankByCount(counter);
        String[] ret = new String[TOP_WORD_COUNT];
        int limit = Math.min(TOP_WORD_COUNT, ranked.size());
        for (int i = 0; i < limit; i++) {
            ret[i] = ranked.get(i).getKey();
        }
        return ret;
    }

    /** Reads every line of {@code fileName} into a list, always closing the reader. */
    private static List<String> readLines(String fileName) throws java.io.IOException {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(fileName));
        try {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                lines.add(line);
            }
        } finally {
            reader.close();
        }
        return lines;
    }

    /** Returns the entries ordered by descending count, then by ascending word. */
    private static List<Map.Entry<String, Integer>> rankByCount(Map<String, Integer> counts) {
        List<Map.Entry<String, Integer>> entries =
                new ArrayList<Map.Entry<String, Integer>>(counts.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Integer>>() {
            @Override
            public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
                int byCount = right.getValue().compareTo(left.getValue());
                return byCount != 0 ? byCount : left.getKey().compareTo(right.getKey());
            }
        });
        return entries;
    }

    /**
     * Splits a line into lower-cased words, dropping stop words.
     *
     * @param title the line to tokenize on {@link #delimiters}
     * @return the remaining words in their original order, duplicates included
     */
    public ArrayList<String> splitWords(String title) {
        StringTokenizer st = new StringTokenizer(title, delimiters);
        ArrayList<String> words = new ArrayList<String>();
        while (st.hasMoreTokens()) {
            // Locale.ROOT: under e.g. a Turkish default locale "I" would lower-case to a
            // dotless i and no longer match the stop-word list.
            String token = st.nextToken().trim().toLowerCase(Locale.ROOT);
            if (!stopWordsSet.contains(token)) {
                words.add(token);
            }
        }
        return words;
    }

    /**
     * Prints the top words for the user id given as the first argument, reading
     * {@code ./input.txt}; prints a usage line when no argument is supplied.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("MP1 <User ID>");
        } else {
            String userName = args[0];
            String inputFileName = "./input.txt";
            MP1 mp = new MP1(userName, inputFileName);
            String[] topItems = mp.process();
            for (String item : topItems) {
                System.out.println(item);
            }
        }
    }
}
/**
 * Orders String keys by the Integer each one maps to in a backing map, larger
 * values first. Keys that map to the same value are ordered by
 * {@link String#compareTo}.
 *
 * <p>Both keys must be present in the map; a missing key fails with a
 * {@link NullPointerException} when its value is unboxed.
 */
class ValueComparator implements Comparator<String> {
    Map<String, Integer> map;

    /**
     * @param base map supplying the value for every key that will be compared
     */
    public ValueComparator(Map<String, Integer> base) {
        this.map = base;
    }

    /**
     * @return -1 when {@code a} has the larger value, 1 when {@code b} has the
     *         larger value, otherwise {@code a.compareTo(b)}
     */
    @Override
    public int compare(String a, String b) {
        int valueOfA = map.get(a);
        int valueOfB = map.get(b);
        if (valueOfA != valueOfB) {
            return valueOfA > valueOfB ? -1 : 1;
        }
        // Equal values: fall back to the keys' natural order.
        return a.compareTo(b);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement