Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import java.io.File;
- import java.io.FileNotFoundException;
- import java.util.*;
/**
 * Command-line tool that reports token sequences shared by two text documents.
 *
 * <p>Each document is read by {@link #readDocument(String)} into lower-cased tokens, where a
 * token is a group of up to {@code WORDS_PER_TOKEN} consecutive words. The second document is
 * indexed by token, and every position of the first document is then extended against each
 * occurrence of the same token in the second document. Runs of at least
 * {@code MIN_MATCH_TOKENS} tokens are reported.
 */
public class Main {

    /** Document scanned for copied passages when no first argument is supplied. */
    private static final String DEFAULT_FIRST_DOCUMENT =
            "/home/lofibytes/Downloads/moby_dick.txt";

    /** Document indexed for look-ups when no second argument is supplied. */
    private static final String DEFAULT_SECOND_DOCUMENT =
            "/home/lofibytes/Downloads/huck_finn_plagiarized.txt";

    /** Number of consecutive words joined into one token by {@link #readDocument(String)}. */
    private static final int WORDS_PER_TOKEN = 3;

    /** Shortest run of matching tokens that is reported as a duplicate. */
    private static final int MIN_MATCH_TOKENS = 4;

    /**
     * Reads two documents, prints the elapsed matching time in seconds and then the list of
     * shared passages.
     *
     * @param args optional: {@code args[0]} is the first document path and {@code args[1]} the
     *     second; any path that is not supplied falls back to its built-in default
     * @throws FileNotFoundException if either document cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        String firstPath = (args != null && args.length > 0) ? args[0] : DEFAULT_FIRST_DOCUMENT;
        String secondPath = (args != null && args.length > 1) ? args[1] : DEFAULT_SECOND_DOCUMENT;

        List<String> first = readDocument(firstPath);
        List<String> second = readDocument(secondPath);

        ArrayList<String> duplicates = getDuplicatesList(
                first.toArray(new String[0]), second.toArray(new String[0]));
        System.out.println(duplicates);
    }

    /**
     * Finds runs of consecutive tokens that occur in both arrays.
     *
     * <p>The first token of a run must be an exact (case-sensitive) key match in the index built
     * from {@code l2}; the following tokens are compared ignoring case. Every occurrence in
     * {@code l2} is tried for each starting position in {@code l1}, and each run of at least
     * {@code MIN_MATCH_TOKENS} tokens is added to the result using the text of {@code l2}. After a
     * position produced at least one reported run, scanning of {@code l1} resumes behind the
     * longest of them so that suffixes of an already reported run are not reported again.
     *
     * <p>Side effect: prints the elapsed time in whole seconds to standard output.
     *
     * @param l1 tokens of the document being scanned
     * @param l2 tokens of the document being indexed
     * @return the matching runs, tokens joined by single spaces, in order of discovery
     */
    private static ArrayList<String> getDuplicatesList(String[] l1, String[] l2) {
        long millis = System.currentTimeMillis();
        ArrayList<String> duplicates = new ArrayList<>();

        // Index every token of l2 by the positions at which it occurs.
        Map<String, List<Integer>> positions = new HashMap<>();
        for (int p = 0; p < l2.length; p++) {
            List<Integer> occurrences = positions.get(l2[p]);
            if (occurrences == null) {
                occurrences = new ArrayList<>();
                positions.put(l2[p], occurrences);
            }
            occurrences.add(p);
        }

        int i = 0;
        while (i < l1.length) {
            int longestReported = 0;
            List<Integer> starts = positions.get(l1[i]);
            if (starts != null) {
                for (int start : starts) {
                    // i is left untouched here so that every occurrence is compared
                    // against the same starting position of l1.
                    int length = matchLength(l1, i, l2, start);
                    if (length >= MIN_MATCH_TOKENS) {
                        duplicates.add(joinTokens(l2, start, start + length));
                        longestReported = Math.max(longestReported, length);
                    }
                }
            }
            // Always advance by at least one token; jump over a reported run.
            i += Math.max(1, longestReported);
        }

        System.out.println((System.currentTimeMillis() - millis) / 1000);
        return duplicates;
    }

    /**
     * Counts how many consecutive tokens agree, starting at {@code aStart} in {@code a} and
     * {@code bStart} in {@code b}. The first pair is assumed to match already; the remaining
     * pairs are compared ignoring case, and both array bounds are checked before each access.
     */
    private static int matchLength(String[] a, int aStart, String[] b, int bStart) {
        int length = 1;
        while (aStart + length < a.length
                && bStart + length < b.length
                && a[aStart + length].equalsIgnoreCase(b[bStart + length])) {
            length++;
        }
        return length;
    }

    /** Joins {@code tokens[from..to)} with single spaces; requires {@code from < to}. */
    private static String joinTokens(String[] tokens, int from, int to) {
        StringBuilder joined = new StringBuilder(tokens[from]);
        for (int t = from + 1; t < to; t++) {
            joined.append(' ').append(tokens[t]);
        }
        return joined.toString();
    }

    /**
     * Reads a text file and splits it into lower-cased tokens of {@code WORDS_PER_TOKEN} words.
     *
     * <p>Words are the non-empty pieces between non-word characters (regex {@code \W}). Every
     * {@code WORDS_PER_TOKEN} consecutive words are joined by single spaces into one token; a
     * shorter final group is kept as the last token.
     *
     * @param fileName path of the file to read
     * @return the tokens in document order; empty if the file contains no words
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static ArrayList<String> readDocument(String fileName) throws FileNotFoundException {
        ArrayList<String> words = new ArrayList<>();
        try (Scanner scanner = new Scanner(new File(fileName))) {
            scanner.useDelimiter("[\\W]"); // all non-word characters
            StringBuilder group = new StringBuilder();
            int wordsInGroup = 0;
            while (scanner.hasNext()) {
                String cur = scanner.next();
                if (cur.isEmpty()) {
                    continue; // adjacent delimiters produce empty pieces
                }
                if (wordsInGroup > 0) {
                    group.append(' ');
                }
                // Locale.ROOT keeps the result independent of the platform locale.
                group.append(cur.toLowerCase(Locale.ROOT));
                wordsInGroup++;
                if (wordsInGroup == WORDS_PER_TOKEN) {
                    words.add(group.toString());
                    group.setLength(0);
                    wordsInGroup = 0;
                }
            }
            // Flush here rather than inside the loop: trailing punctuation after the last
            // word must not cause the final partial group to be dropped.
            if (wordsInGroup > 0) {
                words.add(group.toString());
            }
        }
        return words;
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement