Advertisement
Guest User

Untitled

a guest
Dec 13th, 2018
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.13 KB | None | 0 0
  1. import java.util.*;
  2. import java.io.IOException;
  3. import java.io.UnsupportedEncodingException;
  4. import java.nio.file.Files;
  5. import java.nio.file.Path;
  6. import java.nio.file.Paths;
  7. import org.omg.CORBA.IntHolder;
  8.  
  /**
   * Command-line concordance lookup: prints every occurrence of a word in a
   * corpus, with surrounding context, using three pre-built index files.
   *
   * <p>Layout of the data as this class reads it (all integers are 32-bit,
   * least significant byte first, see {@link #intFromBytes}):
   * <ul>
   *   <li>{@code index3} ("lazy" index): one integer per {@link #hash} value of
   *       the first three letters of a word; the integer is a byte offset into
   *       {@code index2}. The entry that follows gives the end of the bucket.</li>
   *   <li>{@code index2}: one integer per word, a byte offset into
   *       {@code index1}. The binary search assumes the referenced words are
   *       sorted according to {@link String#compareTo}.</li>
   *   <li>{@code index1}: per word, the word bytes, a space, an occurrence
   *       count and then that many corpus byte offsets.</li>
   *   <li>{@code korpus}: the text itself, decoded as ISO-8859-1.</li>
   * </ul>
   */
  public class Konkordans {
      final String INDEX1_PATH = "/dev/shm/index1.txt";
      byte[] index1;

      final String INDEX2_PATH = "/dev/shm/index2.txt";
      byte[] index2;

      final String INDEX3_PATH = "/dev/shm/lazyindex.txt";
      byte[] index3;

      final String KORPUS_PATH = "/dev/shm/realkorpus.txt";
      byte[] korpus;

      static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzåäö";

      /** Charset used to decode bytes taken from index1 and from the corpus. */
      private static final java.nio.charset.Charset LATIN1 =
              java.nio.charset.StandardCharsets.ISO_8859_1;

      /** Maximum number of bytes of an index1 word that is read for comparison. */
      private static final int MAX_WORD_LENGTH = 60;

      final int SEGMENT_SIZE = 4;
      final int CONTEXT_LENGTH = 60;
      final int COUNTER_WIDTH = 6;
      final int WARNING_LIMIT = 25;
      final String SEPARATOR = ". ";

      /**
       * Mutable int box used to pass the search window into and out of
       * {@link #binarySearch}.
       *
       * <p>Declared here because {@code org.omg.CORBA.IntHolder} was removed from
       * the JDK in Java 11. Inside this class the member type shadows the
       * file-level import of the CORBA class, so that import can be dropped.
       */
      static final class IntHolder {
          public int value;

          IntHolder() {
          }

          IntHolder(int initial) {
              this.value = initial;
          }
      }

      /**
       * Entry point. Expects the search term as the first argument; the term must
       * be non-empty and consist only of characters in {@link #ALPHABET}.
       *
       * @param args command-line arguments; only {@code args[0]} is used
       */
      public static void main(String[] args) {
          if (args.length < 1 || args[0].isEmpty()) {
              System.out.println("You need a search term!");
              return;
          }

          String term = args[0];
          for (char ch : term.toCharArray()) {
              if (ALPHABET.indexOf(ch) < 0) {
                  System.out.println("Invalid character in search term: " + ch);
                  return;
              }
          }

          Konkordans konkordans;
          try {
              konkordans = new Konkordans();
          } catch (java.io.UncheckedIOException e) {
              System.out.println("Could not find index files!");
              return;
          }
          konkordans.search(term);
      }

      /**
       * Loads the three index files and the corpus fully into memory.
       *
       * @throws java.io.UncheckedIOException if any of the files cannot be read.
       *         Previously the failure was only printed and the instance was left
       *         with null arrays, which made the next call fail with a
       *         NullPointerException.
       */
      public Konkordans() {
          try {
              index1 = Files.readAllBytes(Paths.get(INDEX1_PATH));
              index2 = Files.readAllBytes(Paths.get(INDEX2_PATH));
              index3 = Files.readAllBytes(Paths.get(INDEX3_PATH));
              korpus = Files.readAllBytes(Paths.get(KORPUS_PATH));
          } catch (IOException e) {
              throw new java.io.UncheckedIOException("Could not read index files", e);
          }
      }

      /**
       * Creates an instance over already-loaded data, without touching the file
       * system. The arrays are used as-is (not copied).
       */
      Konkordans(byte[] index1, byte[] index2, byte[] index3, byte[] korpus) {
          this.index1 = index1;
          this.index2 = index2;
          this.index3 = index3;
          this.korpus = korpus;
      }

      /**
       * Prints every occurrence of {@code searchTerm} to standard output: a
       * numbered line of context followed by a line of carets under the match.
       * After {@link #WARNING_LIMIT} hits the user is asked whether to continue.
       *
       * @param searchTerm the word to look up
       */
      public void search(String searchTerm) {
          int[] occurrences = findOccurrences(searchTerm);
          int count = 0;

          for (int occurrence : occurrences) {
              count++;

              if (count == WARNING_LIMIT + 1 && !confirmContinue(occurrences.length)) {
                  break;
              }

              int start = Math.max(0, occurrence - CONTEXT_LENGTH / 2);
              int stop = Math.min(occurrence + searchTerm.length() + CONTEXT_LENGTH / 2,
                      korpus.length);

              String context = new String(korpus, start, stop - start, LATIN1).replace("\n", " ");

              System.out.format("%" + COUNTER_WIDTH + "d", count);
              System.out.println(SEPARATOR + context);

              // Pad by the number of context characters actually printed before
              // the match: near the start of the corpus that is fewer than
              // CONTEXT_LENGTH / 2, and a fixed pad would misplace the carets.
              int pad = COUNTER_WIDTH + SEPARATOR.length() + (occurrence - start);
              System.out.println(repeat(' ', pad) + repeat('^', searchTerm.length()));
          }
      }

      /**
       * Asks whether a long result listing should continue.
       *
       * @param total total number of occurrences found
       * @return {@code false} only if the first byte typed is 'n' or 'N'; any
       *         other input, end of input, or a read error continues the listing
       */
      private boolean confirmContinue(int total) {
          System.out.println("There are a lot of occurrences (" + total + ">" + WARNING_LIMIT + "). Do you want to continue?");
          try {
              int answer = System.in.read();
              return answer != 'n' && answer != 'N';
          } catch (IOException ignored) {
              // Standard input is unusable; carry on listing, as before.
              return true;
          }
      }

      /** Returns a string consisting of {@code times} copies of {@code ch}. */
      private static String repeat(char ch, int times) {
          char[] chars = new char[Math.max(0, times)];
          Arrays.fill(chars, ch);
          return new String(chars);
      }

      /**
       * Looks up {@code needle} in the index and returns its corpus offsets.
       *
       * @param needle the word to look up
       * @return the offsets stored for the word in index1, or an empty array if
       *         the word is not in the index
       */
      int[] findOccurrences(String needle) {
          int lazyOffset = hash(needle.substring(0, Math.min(needle.length(), 3))) * SEGMENT_SIZE;
          if (lazyOffset < 0 || lazyOffset + SEGMENT_SIZE > index3.length) {
              // Hash falls outside the lazy index (e.g. characters not in ALPHABET).
              return new int[0];
          }

          IntHolder low = new IntHolder(readInt(index3, lazyOffset) / SEGMENT_SIZE);

          // The next lazy entry marks the end of this bucket. If the lazy index
          // has no entry after this one, fall back to the end of index2.
          int nextOffset = lazyOffset + SEGMENT_SIZE;
          int bucketEnd = nextOffset + SEGMENT_SIZE <= index3.length
                  ? readInt(index3, nextOffset) / SEGMENT_SIZE
                  : index2.length / SEGMENT_SIZE;
          IntHolder high = new IntHolder(bucketEnd);

          binarySearch(needle, low, high);

          System.out.println("low = " + low.value);
          System.out.println("high = " + high.value);

          // The scan starts two entries before the window, as the original code
          // did (the reason is not evident from this file); linearSearch clamps
          // the range so this can no longer index before the start of index2.
          int wordPos = linearSearch(needle, low.value - 2, high.value);
          if (wordPos == -1) {
              return new int[0];
          }

          // Skip the word and the separating space to reach the occurrence count.
          while (index1[wordPos] != ' ') {
              wordPos++;
          }
          wordPos++;

          int size = readInt(index1, wordPos);
          int[] occurrences = new int[size];
          for (int seg = 0; seg < size; seg++) {
              occurrences[seg] = readInt(index1, wordPos + (1 + seg) * SEGMENT_SIZE);
          }
          return occurrences;
      }

      /**
       * Scans index2 entries {@code low..high} (inclusive) for {@code needle}.
       * The range is clamped to the entries that actually exist in index2.
       *
       * @param needle the word to find
       * @param low    first index2 entry to examine (may be negative)
       * @param high   last index2 entry to examine (may be past the end)
       * @return the byte offset of the word in index1, or -1 if not found
       */
      int linearSearch(String needle, int low, int high) {
          int first = Math.max(0, low);
          int last = Math.min(high, index2.length / SEGMENT_SIZE - 1);

          for (int entry = first; entry <= last; entry++) {
              int pos = readInt(index2, entry * SEGMENT_SIZE);
              String word = readWord(pos);
              if (needle.equals(word)) {
                  System.out.println("Found: " + word);
                  return pos;
              }
          }
          return -1;
      }

      /**
       * Narrows the window {@code [low, high]} of index2 entries around
       * {@code needle} until it spans at most five entries, leaving the rest to
       * {@link #linearSearch}. On an exact hit both bounds are set to the hit.
       *
       * @param needle the word to find
       * @param low    in/out: lower bound of the window (index2 entry number)
       * @param high   in/out: upper bound of the window (index2 entry number)
       */
      void binarySearch(String needle, IntHolder low, IntHolder high) {
          while ((high.value - low.value) * SEGMENT_SIZE > 20) {
              int mid = low.value + (high.value - low.value) / 2;

              // Compare against the word itself; decoding the whole fixed-size
              // buffer would append NUL characters and hide exact matches.
              String middleWord = readWord(readInt(index2, mid * SEGMENT_SIZE));

              int cmp = needle.compareTo(middleWord);
              if (cmp > 0) {
                  low.value = mid;
              } else if (cmp < 0) {
                  high.value = mid;
              } else {
                  low.value = mid;
                  high.value = mid;
                  return;
              }
          }
      }

      /**
       * Decodes the word that starts at {@code pos} in index1: everything up to
       * the next space, the end of index1, or {@link #MAX_WORD_LENGTH} bytes,
       * whichever comes first.
       */
      private String readWord(int pos) {
          int length = 0;
          while (length < MAX_WORD_LENGTH
                  && pos + length < index1.length
                  && index1[pos + length] != ' ') {
              length++;
          }
          return new String(index1, pos, length, LATIN1);
      }

      /**
       * Maps a string over {@link #ALPHABET} to a number by treating each
       * character's alphabet position as a base-29 digit.
       *
       * @param str the string to hash
       * @return the base-29 value of {@code str}
       */
      public int hash(String str) {
          int value = 0;

          for (int i = 0; i < str.length(); i++) {
              value *= 29;
              value += ALPHABET.indexOf(str.charAt(i));
          }

          return value;
      }

      /**
       * Decodes the first four bytes of {@code bytes} as a 32-bit integer,
       * least significant byte first.
       */
      int intFromBytes(byte[] bytes) {
          return readInt(bytes, 0);
      }

      /**
       * Decodes four bytes of {@code src} starting at {@code offset} as a 32-bit
       * integer, least significant byte first.
       */
      private static int readInt(byte[] src, int offset) {
          return src[offset + 3] << 24
                  | (src[offset + 2] & 0xFF) << 16
                  | (src[offset + 1] & 0xFF) << 8
                  | (src[offset] & 0xFF);
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement