Advertisement
Martina312

[НП] - Text Processor 2

Aug 23rd, 2020
1,953
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Java 5.84 KB | None | 0 0
  import java.io.InputStream;
  import java.io.OutputStream;
  import java.io.PrintStream;
  import java.util.*;
  import java.util.stream.Collectors;
  5.  
  /**
   * A single line of text together with the number of times each word occurs in it.
   *
   * <p>The line is lower-cased and filtered before being split on whitespace; the
   * unmodified input is kept so that {@link #toString()} can return it verbatim.
   */
  class Text{
      private final String line; // normalised line: lower-cased, filtered characters removed
      private final String original; // the line exactly as it was passed in
      Map<String, Integer> wordFrequencies; // word -> number of occurrences in this line
      List<Integer> freq; // frequency vector built by the most recent getFreq call

      /**
       * Normalises {@code line} and counts its words.
       *
       * @param line the raw input line
       */
      public Text(String line) {
          this.original = line;
          // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
          // NOTE(review): '+' inside the character class is a literal plus sign, so '+'
          // survives this filter together with letters and whitespace. The pattern is
          // left unchanged because TextProcessor.readText tokenises with the same one.
          this.line = line.toLowerCase(Locale.ROOT).replaceAll("[^A-Za-z\\s+]", "");
          this.wordFrequencies = new HashMap<>();
          this.freq = new ArrayList<>();
          countWords();
      }

      /**
       * Recomputes {@link #wordFrequencies} from the normalised line. Safe to call more
       * than once: counts are rebuilt from scratch rather than accumulated.
       */
      public void fillTheMap(){
          countWords();
      }

      /** Clears the word map and counts every non-empty whitespace-separated token. */
      private void countWords() {
          wordFrequencies.clear();
          for (String word : line.split("\\s+")) {
              // split() yields an empty token for an empty line or leading whitespace;
              // that is not a word and must not be counted.
              if (!word.isEmpty()) {
                  // merge() avoids modifying the map from inside a computeIfAbsent
                  // callback, which HashMap rejects with ConcurrentModificationException.
                  wordFrequencies.merge(word, 1, Integer::sum);
              }
          }
      }

      /**
       * Builds this line's frequency vector over {@code words}, in the set's iteration
       * order, and stores it so that {@link #frequencies()} returns it afterwards.
       *
       * @param words the vocabulary to report on
       * @return the vector formatted as {@code [a, b, c]}; {@code []} for an empty vocabulary
       */
      public String getFreq(Set<String> words){
          // Start from an empty vector so repeated calls do not keep appending.
          freq.clear();
          for (String word : words) {
              freq.add(getOccurencies(word));
          }
          return freq.stream()
                  .map(String::valueOf)
                  .collect(Collectors.joining(", ", "[", "]"));
      }

      /**
       * @param word the word to look up
       * @return how many times {@code word} occurs in this line, or 0 if it does not occur
       */
      public int getOccurencies(String word){
          return wordFrequencies.getOrDefault(word, 0);
      }

      /**
       * @return the vector built by the most recent {@link #getFreq(Set)} call; empty if
       *         {@code getFreq} has not been called yet
       */
      public List<Integer> frequencies(){
          return freq;
      }

      /** @return the original, unnormalised line */
      @Override
      public String toString() {
          return original;
      }
  }
  61.  
  62. class TextProcessor{
  63.     List<Text> textLines;
  64.     Set<String> words;
  65.     Map<String, Integer> map;
  66.  
  67.     public TextProcessor() {
  68.         this.textLines = new ArrayList<>();
  69.         this.words = new TreeSet<>(String::compareTo);
  70.         this.map = new TreeMap<>();
  71.     }
  72.  
  73.     public void readText(InputStream is){
  74.         Scanner in = new Scanner(System.in);
  75.  
  76.         while (in.hasNextLine()){
  77.             String line = in.nextLine();
  78.             textLines.add(new Text(line));
  79.             line = line.replaceAll("[^A-Za-z\\s+]", "");
  80.             line = line.toLowerCase();
  81.  
  82.             String [] parts = line.split("\\s+");
  83.             words.addAll(Arrays.asList(parts));
  84.         }
  85.         fillTheMap();
  86.     }
  87.  
  88.     public void fillTheMap(){
  89.         words.forEach(word -> {
  90.             textLines.forEach(text -> {
  91.                 map.computeIfAbsent(word, (k) -> map.put(word, 0));
  92.                 map.put(word, map.get(word)+text.getOccurencies(word));
  93.             });
  94.  
  95.         });
  96.     }
  97.  
  98.     public void printTextsVectors(OutputStream os){
  99.         textLines.forEach(text -> {
  100.             System.out.println(text.getFreq(words));
  101.         });
  102.     }
  103.  
  104.     public void printCorpus(OutputStream os, int n, boolean ascending){
  105.         Comparator<Integer> comparatorAscending = Comparator.comparingInt(Integer::intValue);
  106.         Comparator<Integer> comparatorDescending = Comparator.comparingInt(Integer::intValue).reversed();
  107.  
  108.         Comparator<Integer> comparator = ascending ? comparatorAscending : comparatorDescending;
  109.        
  110.         map.entrySet().stream().sorted(Map.Entry.comparingByValue(comparator))
  111.                 .limit(n)
  112.                 .forEach(entry -> System.out.println(entry.getKey()+" : "+entry.getValue()));
  113.     }
  114.  
  115.     public void mostSimilarTexts(OutputStream os){
  116.         double maxSimilarity = 0;
  117.         int maxI = 0;
  118.         int maxJ = 0;
  119.         for(int i=0; i<textLines.size()-1; i++){
  120.             for (int j=0; j<textLines.size(); j++){
  121.                 if (i!=j){
  122.                     double similarity = CosineSimilarityCalculator.cosineSimilarity(textLines.get(i).frequencies(), textLines.get(j).frequencies());
  123.                     if (similarity > maxSimilarity){
  124.                         maxSimilarity = similarity;
  125.                         maxI = i;
  126.                         maxJ = j;
  127.                     }
  128.                 }
  129.             }
  130.         }
  131.         System.out.println(textLines.get(maxI));
  132.         System.out.println(textLines.get(maxJ));
  133.         System.out.println(String.format("%.10f", maxSimilarity));
  134.     }
  135. }
  136.  
  137.  
  /** Computes the cosine similarity between two integer vectors. */
  class CosineSimilarityCalculator {
      /**
       * Returns the cosine of the angle between the two vectors: their dot product divided
       * by the product of their Euclidean norms. Elements are paired in iteration order.
       *
       * @param c1 first vector
       * @param c2 second vector; must have the same number of elements as {@code c1}
       * @return the cosine similarity, or {@code 0.0} when either vector has zero norm
       *         (including two empty vectors), where the quotient would otherwise be NaN
       * @throws IllegalArgumentException if the two vectors differ in length
       */
      public static double cosineSimilarity (Collection<Integer> c1, Collection<Integer> c2) {
          if (c1.size() != c2.size()) {
              throw new IllegalArgumentException(
                      "Vectors must have the same length: " + c1.size() + " vs " + c2.size());
          }

          double dot = 0.0;
          double squares1 = 0.0;
          double squares2 = 0.0;

          Iterator<Integer> first = c1.iterator();
          Iterator<Integer> second = c2.iterator();
          while (first.hasNext() && second.hasNext()) {
              // Widen to double before multiplying so large counts cannot overflow int.
              double a = first.next();
              double b = second.next();
              dot += a * b;
              squares1 += a * a;
              squares2 += b * b;
          }

          if (squares1 == 0.0 || squares2 == 0.0) {
              return 0.0;
          }
          return dot / (Math.sqrt(squares1) * Math.sqrt(squares2));
      }
  }
  162.  
  163. public class TextProcessorTest {
  164.  
  165.     public static void main(String[] args) {
  166.         TextProcessor textProcessor = new TextProcessor();
  167.  
  168.         textProcessor.readText(System.in);
  169.  
  170.         System.out.println("===PRINT VECTORS===");
  171.         textProcessor.printTextsVectors(System.out);
  172.  
  173.         System.out.println("PRINT FIRST 20 WORDS SORTED ASCENDING BY FREQUENCY ");
  174.        textProcessor.printCorpus(System.out,  20, true);
  175.  
  176.         System.out.println("PRINT FIRST 20 WORDS SORTED DESCENDING BY FREQUENCY");
  177.         textProcessor.printCorpus(System.out, 20, false);
  178.  
  179.         System.out.println("===MOST SIMILAR TEXTS===");
  180.         textProcessor.mostSimilarTexts(System.out);
  181.     }
  182. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement