Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.*;
- import java.util.*;
- public class Assignment1 {
- public static void main(String[] args) {
- try {
- System.out.print("Enter name of a directory> ");
- Scanner scan = new Scanner(System.in);
- File dir = new File(scan.nextLine());
- File[] fileList = dir.listFiles();
- //stop words array
- List<String> stopWords = Arrays.asList("i", "was", "the", "and", "am", "an", "it", "is", "a", "of", "&", "for", "this", "in", "with");
- //stemmer stuff
- Stemmer s = new Stemmer();
- String filePath = dir.toString();
- //Map<String, String> wordanDoc = new HashMap<String, String>();
- Map<String, String> invIndex = new HashMap<String, String>();
- Set<String> allWords= new HashSet<String>();
- for(File f: fileList) {
- Scanner sc = new Scanner(f);
- while(sc.hasNextLine()) {
- StringTokenizer st = new StringTokenizer(sc.nextLine());
- while (st.hasMoreTokens()) {
- String word = st.nextToken();
- //file location testing purpose: C:\Users\brobs\OneDrive\Desktop\Classes\taglines\taglines
- // /home/brobs0111/Documents/taglines/
- /// /home/brobs0111/Documents/alldocs/
- //replace all punc besides hyphens and forward slashes and convert all to lower case
- word = word.replaceAll("[^\\w\\/\\+\\-]","").toLowerCase();
- //if the word is a stop word it breaks out of loop
- if(stopWords.contains(word) )
- continue;
- //the current file path is saved to docName then everything besides the name is removed
- String docName=f.getPath()
- .replace(filePath,"")
- .replace("/","")
- .replace(".txt","");
- //if(invIndex.containsValue(word)){
- // String temp = wordanDoc.get(word).concat("," + docName);
- // wordanDoc.put(word, temp);
- //}else{
- // wordanDoc.put(word, docName);
- //}
- allWords.add(word);
- // adding to stemmer
- for (int i=0; i<word.length(); i++)
- { if(!word.contains("-"))
- if (Character.isLetter(word.charAt(i))) {
- s.add(word.charAt(i));
- }
- }
- //prevent hyphened words from being stemmed
- if(!word.contains("-")) {
- s.stem();
- if(invIndex.containsKey(s.toString())){
- String temp = invIndex.get(s.toString()).concat("," + docName);
- invIndex.put(s.toString(), temp);}
- else invIndex.put(s.toString(), docName);
- }
- //adds unstemmed word to wordIndex
- else {
- if(invIndex.containsKey(word)){
- String temp = invIndex.get(word).concat("," + docName);
- invIndex.put(word, temp);}
- else invIndex.put(word, docName);
- }
- }
- }
- }
- //Map<String, String> sorted= new TreeMap<>();
- //sorted.putAll(wordanDoc);
- //for(Map.Entry<String, String> entry: sorted.entrySet())
- // System.out.println("Word: " + entry.getKey()+ " DocID's<" + entry.getValue() + ">");
- //wordanDoc.entrySet().forEach(entry->{
- // System.out.println("Word: "+entry.getKey() + "|| DocID <" + entry.getValue()+"> ");
- //});
- invIndex.entrySet().forEach(entry->{
- System.out.println("Word: "+entry.getKey() + "|| DocID <" + entry.getValue()+"> ");
- });
- }
- catch(Exception e) {
- System.out.println("Error: " + e.toString());
- }
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement