Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * Restores a dialog between two speakers who each wrote in a different,
 * unknown Russian text encoding.
 *
 * <p>The input file is treated as alternating lines: line 1 belongs to the
 * first speaker, line 2 to the second, line 3 to the first again, and so on.
 * For every speaker the encoding is guessed by decoding that speaker's bytes
 * with each candidate in {@link #ENCODINGS} and picking the one whose Russian
 * letter-frequency vector is closest (Euclidean distance) to a reference
 * vector built from a sample text. All lines are then decoded and written
 * back in their original order as UTF-8.
 *
 * <p>Known limitation: lines are split on the single byte {@code 0x0A}. That
 * is correct for the single-byte encodings and UTF-8, but a UTF-16 newline is
 * two bytes, so multi-line UTF-16 input is not framed correctly.
 */
public class DialogEncoder {

    /** Default location of the UTF-8 sample text used by {@link #main}. */
    private static final String ANALYSIS_FILE_NAME = "analysis/war_utf8.txt";

    /** The 33 lowercase Russian letters; the index of a letter is its slot in a frequency vector. */
    private static final String ALPHABET = buildAlphabet();

    /** When set, the encoding chosen for each speaker is printed to stdout. */
    private static final boolean DEBUG = true;

    /** Candidate encodings, tried in this order; on equal distance the earlier one wins. */
    private static final String[] ENCODINGS = {
        "cp866",
        "cp1251",
        "koi8-r",
        "iso-8859-5",
        "utf-8",
        "utf-16le",
        "utf-16be",
    };

    /** Used when no candidate decoding yields a single Russian letter (e.g. ASCII-only text). */
    private static final String FALLBACK_ENCODING = "utf-8";

    /** Number of speakers taking turns line by line. */
    private static final int SPEAKERS = 2;

    /** Reference letter frequencies computed from the analysis file. */
    private final double[] simpleFrequencyVector;

    /**
     * Builds the reference frequency vector from a sample text.
     *
     * @param analysisFileName path to a UTF-8 encoded Russian sample text
     * @throws IOException if the file cannot be read or contains no Russian letters
     */
    public DialogEncoder(String analysisFileName) throws IOException {
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        byte[] raw = Files.readAllBytes(Paths.get(analysisFileName));
        double[] reference = getFrequencyVector(new String(raw, StandardCharsets.UTF_8));
        if (reference == null) {
            throw new IOException(
                "Analysis file contains no Russian letters: " + analysisFileName);
        }
        simpleFrequencyVector = reference;
    }

    /**
     * Generates the alphabet from code points rather than a hand-typed literal,
     * which keeps the source ASCII-only and rules out duplicated letters.
     */
    private static String buildAlphabet() {
        StringBuilder letters = new StringBuilder(33);
        for (char ch = '\u0430'; ch <= '\u044f'; ch++) {
            letters.append(ch);
            if (ch == '\u0435') {
                // U+0451 (io) lies outside the contiguous range; it follows U+0435 (ie).
                letters.append('\u0451');
            }
        }
        return letters.toString();
    }

    /**
     * Computes relative frequencies of Russian letters in {@code text},
     * ignoring case and every non-alphabet character.
     *
     * @return a vector indexed like {@link #ALPHABET} whose entries sum to 1,
     *         or {@code null} if the text contains no Russian letter at all
     */
    private static double[] getFrequencyVector(String text) {
        double[] frequencies = new double[ALPHABET.length()];
        int letters = 0;
        for (int i = 0; i < text.length(); i++) {
            int slot = ALPHABET.indexOf(Character.toLowerCase(text.charAt(i)));
            if (slot >= 0) {
                frequencies[slot]++;
                letters++;
            }
        }
        if (letters == 0) {
            // Dividing by zero would fill the vector with NaN and poison every comparison.
            return null;
        }
        for (int i = 0; i < frequencies.length; i++) {
            frequencies[i] /= letters;
        }
        return frequencies;
    }

    /** Returns the Euclidean distance between two frequency vectors of alphabet length. */
    private static double getDist(double[] first, double[] second) {
        double squares = 0;
        for (int i = 0; i < ALPHABET.length(); i++) {
            double delta = first[i] - second[i];
            squares += delta * delta;
        }
        return Math.sqrt(squares);
    }

    /**
     * Guesses the encoding of {@code sample} by comparing letter frequencies of
     * every candidate decoding against the reference vector.
     *
     * @param sample all bytes written by one speaker, newlines removed
     * @return the best matching name from {@link #ENCODINGS}, or
     *         {@link #FALLBACK_ENCODING} if no decoding contains Russian letters
     * @throws UnsupportedEncodingException if the JVM lacks a candidate charset
     */
    private String detectEncoding(byte[] sample) throws UnsupportedEncodingException {
        String best = FALLBACK_ENCODING;
        double bestDist = Double.POSITIVE_INFINITY;
        for (String encoding : ENCODINGS) {
            double[] frequencies = getFrequencyVector(new String(sample, encoding));
            if (frequencies == null) {
                // A decoding without any Russian letter is never a candidate.
                continue;
            }
            double dist = getDist(frequencies, simpleFrequencyVector);
            if (dist < bestDist) {
                bestDist = dist;
                best = encoding;
            }
        }
        if (DEBUG) {
            System.out.println(best);
        }
        return best;
    }

    /**
     * Splits {@code data} at every {@code '\n'} byte. The returned lines do not
     * include the terminator; a non-empty tail after the last newline is
     * returned as a final line.
     */
    private static List<byte[]> splitLines(byte[] data) {
        List<byte[]> lines = new ArrayList<>();
        int start = 0;
        for (int i = 0; i < data.length; i++) {
            if (data[i] == '\n') {
                lines.add(Arrays.copyOfRange(data, start, i));
                start = i + 1;
            }
        }
        if (start < data.length) {
            lines.add(Arrays.copyOfRange(data, start, data.length));
        }
        return lines;
    }

    /**
     * Decodes the dialog in {@code inputFileName} and writes it to
     * {@code outputFileName} as UTF-8.
     *
     * <p>Line order and line terminators are preserved: the output ends with a
     * newline exactly when the input does.
     *
     * @param inputFileName  path of the mixed-encoding dialog file
     * @param outputFileName path of the file to create or overwrite
     * @throws IOException if reading, decoding or writing fails
     */
    public void encode(String inputFileName, String outputFileName) throws IOException {
        byte[] data = Files.readAllBytes(Paths.get(inputFileName));
        List<byte[]> lines = splitLines(data);
        boolean endsWithNewline = data.length > 0 && data[data.length - 1] == '\n';

        // Gather each speaker's bytes so the encoding is judged on all of their text.
        ByteArrayOutputStream[] samples = new ByteArrayOutputStream[SPEAKERS];
        for (int s = 0; s < SPEAKERS; s++) {
            samples[s] = new ByteArrayOutputStream();
        }
        for (int i = 0; i < lines.size(); i++) {
            byte[] line = lines.get(i);
            samples[i % SPEAKERS].write(line, 0, line.length);
        }

        String[] encodings = new String[SPEAKERS];
        for (int s = 0; s < SPEAKERS; s++) {
            encodings[s] = detectEncoding(samples[s].toByteArray());
        }

        StringBuilder decoded = new StringBuilder();
        for (int i = 0; i < lines.size(); i++) {
            decoded.append(new String(lines.get(i), encodings[i % SPEAKERS]));
            boolean isLast = i == lines.size() - 1;
            if (!isLast || endsWithNewline) {
                decoded.append('\n');
            }
        }

        Files.write(Paths.get(outputFileName),
            decoded.toString().getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Command-line entry point: {@code java DialogEncoder <input> <output>}.
     * Prints a usage line and returns when fewer than two arguments are given.
     *
     * @throws IOException if the analysis file or the dialog files cannot be processed
     */
    public static void main(String[] args) throws IOException {
        // Validate arguments before touching the file system.
        if (args.length < 2) {
            System.out.println("java DialogEncoder [input_file_name] [output_file_name]");
            return;
        }
        DialogEncoder encoder = new DialogEncoder(ANALYSIS_FILE_NAME);
        encoder.encode(args[0], args[1]);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement