Guest User

Untitled

a guest
May 25th, 2018
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 16.12 KB | None | 0 0
  1. Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
  2. ===================================================================
  3. --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (revision 1232218)
  4. +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (working copy)
  5. @@ -61,7 +61,7 @@
  6. assertTrue(wordId > lastWordId);
  7. lastWordId = wordId;
  8.  
  9. - String baseForm = tid.getBaseForm(wordId);
  10. + String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
  11. assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
  12.  
  13. String inflectionForm = tid.getInflectionForm(wordId);
  14. @@ -91,11 +91,11 @@
  15. // check that its actually an ipadic pos tag
  16. assertNotNull(ToStringUtil.getPOSTranslation(pos));
  17.  
  18. - String pronunciation = tid.getPronunciation(wordId);
  19. + String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
  20. assertNotNull(pronunciation);
  21. assertTrue(UnicodeUtil.validUTF16String(pronunciation));
  22.  
  23. - String reading = tid.getReading(wordId);
  24. + String reading = tid.getReading(wordId, chars, 0, chars.length);
  25. assertNotNull(reading);
  26. assertTrue(UnicodeUtil.validUTF16String(reading));
  27. }
  28. Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
  29. ===================================================================
  30. --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (revision 1232218)
  31. +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (working copy)
  32. @@ -73,12 +73,12 @@
  33. int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
  34. assertEquals(3, result.length);
  35. int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
  36. - assertEquals("ニホン", dictionary.getReading(wordIdNihon));
  37. + assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
  38.  
  39. result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
  40. assertEquals(1, result.length);
  41. int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
  42. - assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
  43. + assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
  44. }
  45.  
  46. @Test
  47. Index: modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
  48. ===================================================================
  49. --- modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (revision 1232218)
  50. +++ modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (working copy)
  51. @@ -103,12 +103,15 @@
  52. if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
  53. flags |= BinaryDictionary.HAS_BASEFORM;
  54. }
  55. + if (!reading.equals(toKatakana(entry[0]))) {
  56. + flags |= BinaryDictionary.HAS_READING;
  57. + }
  58. if (!pronunciation.equals(reading)) {
  59. flags |= BinaryDictionary.HAS_PRONUNCIATION;
  60. }
  61.  
  62. assert leftId == rightId;
  63. - assert leftId < 8192; // there are still unused bits
  64. + assert leftId < 4096; // there are still unused bits
  65. // add pos mapping
  66. int toFill = 1+leftId - posDict.size();
  67. for (int i = 0; i < toFill; i++) {
  68. @@ -119,27 +122,36 @@
  69. assert existing == null || existing.equals(fullPOSData);
  70. posDict.set(leftId, fullPOSData);
  71.  
  72. - buffer.putShort((short)(leftId << 2 | flags));
  73. + buffer.putShort((short)(leftId << 3 | flags));
  74. buffer.putShort(wordCost);
  75.  
  76. if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
  77. - buffer.put((byte) baseForm.length());
  78. - for (int i = 0; i < baseForm.length(); i++) {
  79. + assert baseForm.length() < 16;
  80. + int shared = sharedPrefix(entry[0], baseForm);
  81. + int suffix = baseForm.length() - shared;
  82. + buffer.put((byte) (shared << 4 | suffix));
  83. + for (int i = shared; i < baseForm.length(); i++) {
  84. buffer.putChar(baseForm.charAt(i));
  85. }
  86. }
  87.  
  88. - if (isKatakana(reading)) {
  89. - buffer.put((byte) (reading.length() << 1 | 1));
  90. - writeKatakana(reading);
  91. - } else {
  92. - buffer.put((byte) (reading.length() << 1));
  93. - for (int i = 0; i < reading.length(); i++) {
  94. - buffer.putChar(reading.charAt(i));
  95. + if ((flags & BinaryDictionary.HAS_READING) != 0) {
  96. + if (isKatakana(reading)) {
  97. + buffer.put((byte) (reading.length() << 1 | 1));
  98. + writeKatakana(reading);
  99. + } else {
  100. + buffer.put((byte) (reading.length() << 1));
  101. + for (int i = 0; i < reading.length(); i++) {
  102. + buffer.putChar(reading.charAt(i));
  103. + }
  104. }
  105. }
  106.  
  107. if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
  108. + // we can save 150KB here, but it makes the reader a little complicated.
  109. + // int shared = sharedPrefix(reading, pronunciation);
  110. + // buffer.put((byte) shared);
  111. + // pronunciation = pronunciation.substring(shared);
  112. if (isKatakana(pronunciation)) {
  113. buffer.put((byte) (pronunciation.length() << 1 | 1));
  114. writeKatakana(pronunciation);
  115. @@ -170,6 +182,27 @@
  116. }
  117. }
  118.  
  119. + private String toKatakana(String s) {
  120. + char text[] = new char[s.length()];
  121. + for (int i = 0; i < s.length(); i++) {
  122. + char ch = s.charAt(i);
  123. + if (ch > 0x3040 && ch < 0x3097) {
  124. + text[i] = (char)(ch + 0x60);
  125. + } else {
  126. + text[i] = ch;
  127. + }
  128. + }
  129. + return new String(text);
  130. + }
  131. +
  132. + public static int sharedPrefix(String left, String right) {
  133. + int len = left.length() < right.length() ? left.length() : right.length();
  134. + for (int i = 0; i < len; i++)
  135. + if (left.charAt(i) != right.charAt(i))
  136. + return i;
  137. + return len;
  138. + }
  139. +
  140. public void addMapping(int sourceId, int wordId) {
  141. assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
  142.  
  143. Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
  144. ===================================================================
  145. --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (revision 1232218)
  146. +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (working copy)
  147. @@ -196,7 +196,7 @@
  148. }
  149.  
  150. @Override
  151. - public String getReading(int wordId) {
  152. + public String getReading(int wordId, char surface[], int off, int len) {
  153. return getFeature(wordId, 0);
  154. }
  155.  
  156. @@ -206,12 +206,12 @@
  157. }
  158.  
  159. @Override
  160. - public String getBaseForm(int wordId) {
  161. + public String getBaseForm(int wordId, char surface[], int off, int len) {
  162. return null; // TODO: add support?
  163. }
  164.  
  165. @Override
  166. - public String getPronunciation(int wordId) {
  167. + public String getPronunciation(int wordId, char surface[], int off, int len) {
  168. return null; // TODO: add support?
  169. }
  170.  
  171. Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
  172. ===================================================================
  173. --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (revision 1232218)
  174. +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (working copy)
  175. @@ -54,21 +54,21 @@
  176. * @param wordId word ID of token
  177. * @return Reading of the token
  178. */
  179. - public String getReading(int wordId);
  180. + public String getReading(int wordId, char surface[], int off, int len);
  181.  
  182. /**
  183. * Get base form of word
  184. * @param wordId word ID of token
  185. * @return Base form (only different for inflected words, otherwise null)
  186. */
  187. - public String getBaseForm(int wordId);
  188. + public String getBaseForm(int wordId, char surface[], int off, int len);
  189.  
  190. /**
  191. * Get pronunciation of tokens
  192. * @param wordId word ID of token
  193. * @return Pronunciation of the token
  194. */
  195. - public String getPronunciation(int wordId);
  196. + public String getPronunciation(int wordId, char surface[], int off, int len);
  197.  
  198. /**
  199. * Get inflection type of tokens
  200. Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
  201. ===================================================================
  202. --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (revision 1232218)
  203. +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (working copy)
  204. @@ -149,12 +149,12 @@
  205.  
  206. @Override
  207. public int getLeftId(int wordId) {
  208. - return buffer.getShort(wordId) >>> 2;
  209. + return buffer.getShort(wordId) >>> 3;
  210. }
  211.  
  212. @Override
  213. public int getRightId(int wordId) {
  214. - return buffer.getShort(wordId) >>> 2;
  215. + return buffer.getShort(wordId) >>> 3;
  216. }
  217.  
  218. @Override
  219. @@ -163,21 +163,42 @@
  220. }
  221.  
  222. @Override
  223. - public String getBaseForm(int wordId) {
  224. + public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
  225. if (hasBaseFormData(wordId)) {
  226. int offset = baseFormOffset(wordId);
  227. - int length = buffer.get(offset++) & 0xff;
  228. - return readString(offset, length, false);
  229. + int data = buffer.get(offset++) & 0xff;
  230. + int prefix = data >>> 4;
  231. + int suffix = data & 0xF;
  232. + char text[] = new char[prefix+suffix];
  233. + System.arraycopy(surfaceForm, off, text, 0, prefix);
  234. + for (int i = 0; i < suffix; i++) {
  235. + text[prefix+i] = buffer.getChar(offset + (i << 1));
  236. + }
  237. + return new String(text);
  238. } else {
  239. return null;
  240. }
  241. }
  242.  
  243. @Override
  244. - public String getReading(int wordId) {
  245. - int offset = readingOffset(wordId);
  246. - int readingData = buffer.get(offset++) & 0xff;
  247. - return readString(offset, readingData >>> 1, (readingData & 1) == 1);
  248. + public String getReading(int wordId, char surface[], int off, int len) {
  249. + if (hasReadingData(wordId)) {
  250. + int offset = readingOffset(wordId);
  251. + int readingData = buffer.get(offset++) & 0xff;
  252. + return readString(offset, readingData >>> 1, (readingData & 1) == 1);
  253. + } else {
  254. + // the reading is the surface form, with hiragana shifted to katakana
  255. + char text[] = new char[len];
  256. + for (int i = 0; i < len; i++) {
  257. + char ch = surface[off+i];
  258. + if (ch > 0x3040 && ch < 0x3097) {
  259. + text[i] = (char)(ch + 0x60);
  260. + } else {
  261. + text[i] = ch;
  262. + }
  263. + }
  264. + return new String(text);
  265. + }
  266. }
  267.  
  268. @Override
  269. @@ -186,13 +207,13 @@
  270. }
  271.  
  272. @Override
  273. - public String getPronunciation(int wordId) {
  274. + public String getPronunciation(int wordId, char surface[], int off, int len) {
  275. if (hasPronunciationData(wordId)) {
  276. int offset = pronunciationOffset(wordId);
  277. int pronunciationData = buffer.get(offset++) & 0xff;
  278. return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
  279. } else {
  280. - return getReading(wordId); // same as the reading
  281. + return getReading(wordId, surface, off, len); // same as the reading
  282. }
  283. }
  284.  
  285. @@ -213,7 +234,7 @@
  286. private int readingOffset(int wordId) {
  287. int offset = baseFormOffset(wordId);
  288. if (hasBaseFormData(wordId)) {
  289. - int baseFormLength = buffer.get(offset++) & 0xff;
  290. + int baseFormLength = buffer.get(offset++) & 0xf;
  291. return offset + (baseFormLength << 1);
  292. } else {
  293. return offset;
  294. @@ -221,21 +242,29 @@
  295. }
  296.  
  297. private int pronunciationOffset(int wordId) {
  298. - int offset = readingOffset(wordId);
  299. - int readingData = buffer.get(offset++) & 0xff;
  300. - final int readingLength;
  301. - if ((readingData & 1) == 0) {
  302. - readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
  303. + if (hasReadingData(wordId)) {
  304. + int offset = readingOffset(wordId);
  305. + int readingData = buffer.get(offset++) & 0xff;
  306. + final int readingLength;
  307. + if ((readingData & 1) == 0) {
  308. + readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
  309. + } else {
  310. + readingLength = readingData >>> 1;
  311. + }
  312. + return offset + readingLength;
  313. } else {
  314. - readingLength = readingData >>> 1;
  315. + return readingOffset(wordId);
  316. }
  317. - return offset + readingLength;
  318. }
  319.  
  320. private boolean hasBaseFormData(int wordId) {
  321. return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
  322. }
  323.  
  324. + private boolean hasReadingData(int wordId) {
  325. + return (buffer.getShort(wordId) & HAS_READING) != 0;
  326. + }
  327. +
  328. private boolean hasPronunciationData(int wordId) {
  329. return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
  330. }
  331. @@ -256,6 +285,8 @@
  332.  
  333. /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
  334. public static final int HAS_BASEFORM = 1;
  335. + /** flag that the entry has reading data. otherwise the reading is the surface form with hiragana converted to katakana */
  336. + public static final int HAS_READING = 2;
  337. /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
  338. - public static final int HAS_PRONUNCIATION = 2;
  339. + public static final int HAS_PRONUNCIATION = 4;
  340. }
  341. Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
  342. ===================================================================
  343. --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (revision 1232218)
  344. +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (working copy)
  345. @@ -51,7 +51,7 @@
  346. }
  347.  
  348. @Override
  349. - public String getReading(int wordId) {
  350. + public String getReading(int wordId, char surface[], int off, int len) {
  351. return null;
  352. }
  353.  
  354. Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
  355. ===================================================================
  356. --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (revision 1232218)
  357. +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (working copy)
  358. @@ -75,14 +75,14 @@
  359. * @return reading. null if token doesn't have reading.
  360. */
  361. public String getReading() {
  362. - return dictionary.getReading(wordId);
  363. + return dictionary.getReading(wordId, surfaceForm, offset, length);
  364. }
  365.  
  366. /**
  367. * @return pronunciation. null if token doesn't have pronunciation.
  368. */
  369. public String getPronunciation() {
  370. - return dictionary.getPronunciation(wordId);
  371. + return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
  372. }
  373.  
  374. /**
  375. @@ -110,7 +110,7 @@
  376. * @return base form or null if token is not inflected
  377. */
  378. public String getBaseForm() {
  379. - return dictionary.getBaseForm(wordId);
  380. + return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
  381. }
  382.  
  383. /**
  384. Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
  385. ===================================================================
  386. Cannot display: file marked as a binary type.
  387. svn:mime-type = application/octet-stream
  388. Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
  389. ===================================================================
  390. Cannot display: file marked as a binary type.
  391. svn:mime-type = application/octet-stream
  392. Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
  393. ===================================================================
  394. Cannot display: file marked as a binary type.
  395. svn:mime-type = application/octet-stream
Add Comment
Please, Sign In to add comment