Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
// NOTE(review): the org.omg.CORBA packages were removed from the JDK in Java 11 (JEP 320);
// this import only resolves on JDK 8-10.
import org.omg.CORBA.IntHolder;
/**
 * Command-line concordance lookup: prints every occurrence of a word in a corpus together
 * with a window of surrounding text and a caret marker under the word.
 *
 * <p>The lookup uses three pre-built index files (all integers are 4-byte little-endian):
 * <ul>
 *   <li><b>index3</b> ("lazy" index): one integer per hash of a word's first (up to) three
 *       letters; the value is a byte offset into index2. The following entry is used as the
 *       upper end of the bucket.</li>
 *   <li><b>index2</b>: one integer per word, the byte offset of that word's entry in index1.
 *       The binary search assumes these entries are ordered consistently with
 *       {@link String#compareTo} on the ISO-8859-1 decoded words (index builder not visible
 *       here - confirm).</li>
 *   <li><b>index1</b>: per word: the word bytes, a space, an occurrence count, then that many
 *       byte offsets into the corpus.</li>
 * </ul>
 */
public class Konkordans {
    static final String INDEX1_PATH = "/dev/shm/index1.txt";
    byte[] index1;
    static final String INDEX2_PATH = "/dev/shm/index2.txt";
    byte[] index2;
    static final String INDEX3_PATH = "/dev/shm/lazyindex.txt";
    byte[] index3;
    static final String KORPUS_PATH = "/dev/shm/realkorpus.txt";
    byte[] korpus;

    /** Letters accepted in a search term; a letter's position here is its digit in {@link #hash}. */
    // Unicode escapes (a-ring, a-umlaut, o-umlaut) keep the constant independent of the
    // encoding javac assumes for this source file.
    static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz\u00e5\u00e4\u00f6";

    /** Width in bytes of every integer stored in the index files. */
    static final int SEGMENT_SIZE = 4;
    /** Total number of corpus bytes shown around a hit (half before, half after). */
    static final int CONTEXT_LENGTH = 60;
    /** Column width of the running hit counter. */
    static final int COUNTER_WIDTH = 6;
    /** Number of hits printed before the user is asked whether to continue. */
    static final int WARNING_LIMIT = 25;
    static final String SEPARATOR = ". ";

    /** Encoding of the corpus and of the words stored in index1. */
    private static final Charset CHARSET = StandardCharsets.ISO_8859_1;
    /** Binary search stops once the candidate range spans at most this many entries. */
    private static final int BINARY_SEARCH_CUTOFF = 5;
    /** Extra entries scanned before the narrowed range (slack kept from the original code). */
    private static final int LINEAR_LOOKBEHIND = 2;
    /** Number of leading letters that select the lazy-index bucket. */
    private static final int PREFIX_LENGTH = 3;

    /**
     * Entry point. Expects the search term as the first argument; the term must be non-empty
     * and consist only of letters from {@link #ALPHABET}.
     */
    public static void main(String[] args) {
        if (args.length < 1 || args[0].isEmpty()) {
            System.out.println("You need a search term!");
            return;
        }
        String term = args[0];
        for (int i = 0; i < term.length(); i++) {
            char ch = term.charAt(i);
            if (ALPHABET.indexOf(ch) < 0) {
                System.out.println("Invalid character in search term: " + ch);
                return;
            }
        }
        new Konkordans().search(term);
    }

    /**
     * Loads the index files and the corpus from their fixed paths. If any file cannot be read
     * a message is printed and the instance stays empty: every lookup then reports no hits
     * instead of failing with a NullPointerException.
     */
    public Konkordans() {
        try {
            // Read into locals first so a failure half-way never leaves a partial state.
            byte[] words = Files.readAllBytes(Paths.get(INDEX1_PATH));
            byte[] offsets = Files.readAllBytes(Paths.get(INDEX2_PATH));
            byte[] lazy = Files.readAllBytes(Paths.get(INDEX3_PATH));
            byte[] text = Files.readAllBytes(Paths.get(KORPUS_PATH));
            index1 = words;
            index2 = offsets;
            index3 = lazy;
            korpus = text;
        } catch (IOException e) {
            System.out.println("Could not find index files!");
            System.err.println("Cause: " + e);
        }
    }

    /**
     * Creates an instance over already-loaded data (same layout as the files), bypassing the
     * file system.
     */
    Konkordans(byte[] index1, byte[] index2, byte[] index3, byte[] korpus) {
        this.index1 = index1;
        this.index2 = index2;
        this.index3 = index3;
        this.korpus = korpus;
    }

    /**
     * Prints every occurrence of {@code searchTerm}: a numbered context line followed by a
     * line of carets under the term. After {@link #WARNING_LIMIT} hits the user is asked
     * whether to continue; an answer starting with 'n' or 'N' stops the listing.
     *
     * @param searchTerm the word to look up
     */
    public void search(String searchTerm) {
        int[] occurrences = findOccurrences(searchTerm);
        int count = 0;
        for (int occurrence : occurrences) {
            count++;
            if (count == WARNING_LIMIT + 1) {
                System.out.println("There are a lot of occurrences (" + occurrences.length + ">" + WARNING_LIMIT + "). Do you want to continue?");
                if (!userWantsToContinue()) {
                    return;
                }
            }
            if (occurrence < 0 || occurrence > korpus.length) {
                System.err.println("Skipping corpus offset outside the corpus: " + occurrence);
                continue;
            }
            int start = Math.max(0, occurrence - CONTEXT_LENGTH / 2);
            int stop = Math.min(occurrence + searchTerm.length() + CONTEXT_LENGTH / 2, korpus.length);
            String context = new String(korpus, start, stop - start, CHARSET).replace("\n", " ");

            System.out.format("%" + COUNTER_WIDTH + "d", count);
            System.out.println(SEPARATOR + context);

            // The term sits (occurrence - start) characters into the context. That is less
            // than CONTEXT_LENGTH / 2 when the window was clamped at the start of the corpus,
            // so the indent must not be a fixed half-window.
            int indent = COUNTER_WIDTH + SEPARATOR.length() + (occurrence - start);
            StringBuilder marker = new StringBuilder(indent + searchTerm.length());
            for (int i = 0; i < indent; i++) {
                marker.append(' ');
            }
            for (int i = 0; i < searchTerm.length(); i++) {
                marker.append('^');
            }
            System.out.println(marker);
        }
    }

    /**
     * Reads one line from standard input and interprets it as the answer to the
     * "continue?" prompt.
     *
     * @return {@code false} only if the first non-blank character is 'n' or 'N'; an empty
     *         line, end of input or a read error all mean "continue"
     */
    private static boolean userWantsToContinue() {
        try {
            int first = -1;
            int b;
            while ((b = System.in.read()) != -1 && b != '\n') {
                if (first == -1 && !Character.isWhitespace(b)) {
                    first = b;
                }
            }
            return first != 'n' && first != 'N';
        } catch (IOException e) {
            // Standard input is unusable; keep the best-effort behaviour and carry on.
            return true;
        }
    }

    /**
     * Looks up {@code needle} in the indexes.
     *
     * @param needle the word to find
     * @return the corpus byte offsets stored for the word, or an empty array if the word is
     *         not indexed or the indexes are not loaded
     * @throws IllegalStateException if the word's entry in index1 is truncated or declares an
     *         impossible occurrence count
     */
    int[] findOccurrences(String needle) {
        if (index1 == null || index2 == null || index3 == null || needle.isEmpty()) {
            return new int[0];
        }
        int lastEntry = index2.length / SEGMENT_SIZE - 1;
        if (lastEntry < 0) {
            return new int[0];
        }

        int bucket = hash(needle.substring(0, Math.min(needle.length(), PREFIX_LENGTH)));
        if (bucket < 0) {
            return new int[0];
        }
        int lazyOffset = bucket * SEGMENT_SIZE;
        if (lazyOffset + SEGMENT_SIZE > index3.length) {
            return new int[0];
        }
        int low = intAt(index3, lazyOffset) / SEGMENT_SIZE;
        int nextOffset = lazyOffset + SEGMENT_SIZE;
        // The bucket ends where the next one starts; without a following entry it runs to
        // the end of index2.
        int high = nextOffset + SEGMENT_SIZE <= index3.length
                ? intAt(index3, nextOffset) / SEGMENT_SIZE
                : lastEntry;
        low = Math.max(low, 0);
        high = Math.min(high, lastEntry);

        int[] range = binarySearch(needle, low, high);
        System.out.println("low = " + range[0]);
        System.out.println("high = " + range[1]);

        int wordPos = linearSearch(needle, range[0] - LINEAR_LOOKBEHIND, range[1]);
        if (wordPos == -1) {
            return new int[0];
        }

        // Skip the word and the space that terminates it to reach the occurrence count.
        int cursor = wordPos;
        while (cursor < index1.length && index1[cursor] != ' ') {
            cursor++;
        }
        cursor++;
        if (cursor + SEGMENT_SIZE > index1.length) {
            throw new IllegalStateException("Truncated index entry at offset " + wordPos);
        }
        int size = intAt(index1, cursor);
        int available = (index1.length - cursor - SEGMENT_SIZE) / SEGMENT_SIZE;
        if (size < 0 || size > available) {
            throw new IllegalStateException(
                    "Corrupt occurrence count " + size + " at index offset " + cursor);
        }
        int[] occurrences = new int[size];
        for (int seg = 0; seg < size; seg++) {
            occurrences[seg] = intAt(index1, cursor + (1 + seg) * SEGMENT_SIZE);
        }
        return occurrences;
    }

    /**
     * Scans index2 entries {@code low..high} (both inclusive) for a word equal to
     * {@code needle}. Bounds outside index2 are clamped, so a negative {@code low} is allowed.
     *
     * @return the byte offset of the matching word's entry in index1, or -1 if none matches
     */
    int linearSearch(String needle, int low, int high) {
        if (index1 == null || index2 == null) {
            return -1;
        }
        int from = Math.max(low, 0);
        int to = Math.min(high, index2.length / SEGMENT_SIZE - 1);
        for (int entry = from; entry <= to; entry++) {
            int pos = intAt(index2, entry * SEGMENT_SIZE);
            String word = wordAt(pos);
            if (needle.equals(word)) {
                System.out.println("Found: " + word);
                return pos;
            }
        }
        return -1;
    }

    /**
     * Narrows the index2 entry range {@code [low, high]} around {@code needle} by bisection
     * until it spans at most {@link #BINARY_SEARCH_CUTOFF} entries or the word is hit exactly.
     *
     * @param low  first candidate entry; must be a valid index2 entry number
     * @param high last candidate entry; must be a valid index2 entry number
     * @return a two-element array {@code {low, high}} with the narrowed bounds; both equal
     *         the matching entry on an exact hit
     */
    int[] binarySearch(String needle, int low, int high) {
        while (high - low > BINARY_SEARCH_CUTOFF) {
            int mid = (low + high) >>> 1;
            int cmp = needle.compareTo(wordAt(intAt(index2, mid * SEGMENT_SIZE)));
            if (cmp > 0) {
                low = mid;
            } else if (cmp < 0) {
                high = mid;
            } else {
                return new int[] {mid, mid};
            }
        }
        return new int[] {low, high};
    }

    /**
     * Decodes the word stored in index1 at {@code pos}: all bytes up to the next space (or the
     * end of index1).
     *
     * @return the word, or an empty string if {@code pos} lies outside index1
     */
    private String wordAt(int pos) {
        if (pos < 0 || pos >= index1.length) {
            return "";
        }
        int end = pos;
        while (end < index1.length && index1[end] != ' ') {
            end++;
        }
        return new String(index1, pos, end - pos, CHARSET);
    }

    /**
     * Interprets {@code str} as a base-29 number whose digits are the letters' positions in
     * {@link #ALPHABET}. A character outside the alphabet contributes -1, because that is what
     * {@code indexOf} returns for it.
     */
    public int hash(String str) {
        int value = 0;
        for (int i = 0; i < str.length(); i++) {
            value = value * 29 + ALPHABET.indexOf(str.charAt(i));
        }
        return value;
    }

    /**
     * Decodes the first four bytes of {@code bytes} as a little-endian 32-bit integer.
     */
    int intFromBytes(byte[] bytes) {
        return intAt(bytes, 0);
    }

    /** Decodes the little-endian 32-bit integer stored at {@code data[offset..offset+3]}. */
    private static int intAt(byte[] data, int offset) {
        return (data[offset + 3] << 24)
                | ((data[offset + 2] & 0xFF) << 16)
                | ((data[offset + 1] & 0xFF) << 8)
                | (data[offset] & 0xFF);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement