Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
- ===================================================================
- --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java (working copy)
- @@ -61,7 +61,7 @@
- assertTrue(wordId > lastWordId);
- lastWordId = wordId;
- - String baseForm = tid.getBaseForm(wordId);
- + String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
- assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));
- String inflectionForm = tid.getInflectionForm(wordId);
- @@ -91,11 +91,11 @@
- // check that its actually an ipadic pos tag
- assertNotNull(ToStringUtil.getPOSTranslation(pos));
- - String pronunciation = tid.getPronunciation(wordId);
- + String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
- assertNotNull(pronunciation);
- assertTrue(UnicodeUtil.validUTF16String(pronunciation));
- - String reading = tid.getReading(wordId);
- + String reading = tid.getReading(wordId, chars, 0, chars.length);
- assertNotNull(reading);
- assertTrue(UnicodeUtil.validUTF16String(reading));
- }
- Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
- ===================================================================
- --- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java (working copy)
- @@ -73,12 +73,12 @@
- int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
- assertEquals(3, result.length);
- int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
- - assertEquals("ニホン", dictionary.getReading(wordIdNihon));
- + assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));
- result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
- assertEquals(1, result.length);
- int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
- - assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
- + assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
- }
- @Test
- Index: modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
- ===================================================================
- --- modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java (working copy)
- @@ -103,12 +103,15 @@
- if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
- flags |= BinaryDictionary.HAS_BASEFORM;
- }
- + if (!reading.equals(toKatakana(entry[0]))) {
- + flags |= BinaryDictionary.HAS_READING;
- + }
- if (!pronunciation.equals(reading)) {
- flags |= BinaryDictionary.HAS_PRONUNCIATION;
- }
- assert leftId == rightId;
- - assert leftId < 8192; // there are still unused bits
- + assert leftId < 4096; // 12 id bits + 3 flag bits fill 15 bits of the short; only the sign bit is unused
- // add pos mapping
- int toFill = 1+leftId - posDict.size();
- for (int i = 0; i < toFill; i++) {
- @@ -119,27 +122,36 @@
- assert existing == null || existing.equals(fullPOSData);
- posDict.set(leftId, fullPOSData);
- - buffer.putShort((short)(leftId << 2 | flags));
- + buffer.putShort((short)(leftId << 3 | flags));
- buffer.putShort(wordCost);
- if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
- - buffer.put((byte) baseForm.length());
- - for (int i = 0; i < baseForm.length(); i++) {
- + assert baseForm.length() < 16;
- + int shared = sharedPrefix(entry[0], baseForm);
- + int suffix = baseForm.length() - shared;
- + buffer.put((byte) (shared << 4 | suffix));
- + for (int i = shared; i < baseForm.length(); i++) {
- buffer.putChar(baseForm.charAt(i));
- }
- }
- - if (isKatakana(reading)) {
- - buffer.put((byte) (reading.length() << 1 | 1));
- - writeKatakana(reading);
- - } else {
- - buffer.put((byte) (reading.length() << 1));
- - for (int i = 0; i < reading.length(); i++) {
- - buffer.putChar(reading.charAt(i));
- + if ((flags & BinaryDictionary.HAS_READING) != 0) {
- + if (isKatakana(reading)) {
- + buffer.put((byte) (reading.length() << 1 | 1));
- + writeKatakana(reading);
- + } else {
- + buffer.put((byte) (reading.length() << 1));
- + for (int i = 0; i < reading.length(); i++) {
- + buffer.putChar(reading.charAt(i));
- + }
- }
- }
- if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
- + // sharing the prefix with the reading (sketched below) would save 150KB, but it makes the reader a little complicated.
- + // int shared = sharedPrefix(reading, pronunciation);
- + // buffer.put((byte) shared);
- + // pronunciation = pronunciation.substring(shared);
- if (isKatakana(pronunciation)) {
- buffer.put((byte) (pronunciation.length() << 1 | 1));
- writeKatakana(pronunciation);
- @@ -170,6 +182,27 @@
- }
- }
- + private String toKatakana(String s) {
- + char text[] = new char[s.length()];
- + for (int i = 0; i < s.length(); i++) {
- + char ch = s.charAt(i);
- + if (ch > 0x3040 && ch < 0x3097) {
- + text[i] = (char)(ch + 0x60);
- + } else {
- + text[i] = ch;
- + }
- + }
- + return new String(text);
- + }
- +
- + public static int sharedPrefix(String left, String right) {
- + int len = left.length() < right.length() ? left.length() : right.length();
- + for (int i = 0; i < len; i++)
- + if (left.charAt(i) != right.charAt(i))
- + return i;
- + return len;
- + }
- +
- public void addMapping(int sourceId, int wordId) {
- assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
- Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
- ===================================================================
- --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java (working copy)
- @@ -196,7 +196,7 @@
- }
- @Override
- - public String getReading(int wordId) {
- + public String getReading(int wordId, char surface[], int off, int len) {
- return getFeature(wordId, 0);
- }
- @@ -206,12 +206,12 @@
- }
- @Override
- - public String getBaseForm(int wordId) {
- + public String getBaseForm(int wordId, char surface[], int off, int len) {
- return null; // TODO: add support?
- }
- @Override
- - public String getPronunciation(int wordId) {
- + public String getPronunciation(int wordId, char surface[], int off, int len) {
- return null; // TODO: add support?
- }
- Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
- ===================================================================
- --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java (working copy)
- @@ -54,21 +54,21 @@
- * @param wordId word ID of token
- * @return Reading of the token
- */
- - public String getReading(int wordId);
- + public String getReading(int wordId, char surface[], int off, int len);
- /**
- * Get base form of word
- * @param wordId word ID of token
- * @return Base form (only different for inflected words, otherwise null)
- */
- - public String getBaseForm(int wordId);
- + public String getBaseForm(int wordId, char surface[], int off, int len);
- /**
- * Get pronunciation of tokens
- * @param wordId word ID of token
- * @return Pronunciation of the token
- */
- - public String getPronunciation(int wordId);
- + public String getPronunciation(int wordId, char surface[], int off, int len);
- /**
- * Get inflection type of tokens
- Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
- ===================================================================
- --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java (working copy)
- @@ -149,12 +149,12 @@
- @Override
- public int getLeftId(int wordId) {
- - return buffer.getShort(wordId) >>> 2;
- + return buffer.getShort(wordId) >>> 3;
- }
- @Override
- public int getRightId(int wordId) {
- - return buffer.getShort(wordId) >>> 2;
- + return buffer.getShort(wordId) >>> 3;
- }
- @Override
- @@ -163,21 +163,42 @@
- }
- @Override
- - public String getBaseForm(int wordId) {
- + public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
- if (hasBaseFormData(wordId)) {
- int offset = baseFormOffset(wordId);
- - int length = buffer.get(offset++) & 0xff;
- - return readString(offset, length, false);
- + int data = buffer.get(offset++) & 0xff;
- + int prefix = data >>> 4;
- + int suffix = data & 0xF;
- + char text[] = new char[prefix+suffix];
- + System.arraycopy(surfaceForm, off, text, 0, prefix);
- + for (int i = 0; i < suffix; i++) {
- + text[prefix+i] = buffer.getChar(offset + (i << 1));
- + }
- + return new String(text);
- } else {
- return null;
- }
- }
- @Override
- - public String getReading(int wordId) {
- - int offset = readingOffset(wordId);
- - int readingData = buffer.get(offset++) & 0xff;
- - return readString(offset, readingData >>> 1, (readingData & 1) == 1);
- + public String getReading(int wordId, char surface[], int off, int len) {
- + if (hasReadingData(wordId)) {
- + int offset = readingOffset(wordId);
- + int readingData = buffer.get(offset++) & 0xff;
- + return readString(offset, readingData >>> 1, (readingData & 1) == 1);
- + } else {
- + // the reading is the surface form, with hiragana shifted to katakana
- + char text[] = new char[len];
- + for (int i = 0; i < len; i++) {
- + char ch = surface[off+i];
- + if (ch > 0x3040 && ch < 0x3097) {
- + text[i] = (char)(ch + 0x60);
- + } else {
- + text[i] = ch;
- + }
- + }
- + return new String(text);
- + }
- }
- @Override
- @@ -186,13 +207,13 @@
- }
- @Override
- - public String getPronunciation(int wordId) {
- + public String getPronunciation(int wordId, char surface[], int off, int len) {
- if (hasPronunciationData(wordId)) {
- int offset = pronunciationOffset(wordId);
- int pronunciationData = buffer.get(offset++) & 0xff;
- return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
- } else {
- - return getReading(wordId); // same as the reading
- + return getReading(wordId, surface, off, len); // same as the reading
- }
- }
- @@ -213,7 +234,7 @@
- private int readingOffset(int wordId) {
- int offset = baseFormOffset(wordId);
- if (hasBaseFormData(wordId)) {
- - int baseFormLength = buffer.get(offset++) & 0xff;
- + int baseFormLength = buffer.get(offset++) & 0xf;
- return offset + (baseFormLength << 1);
- } else {
- return offset;
- @@ -221,21 +242,29 @@
- }
- private int pronunciationOffset(int wordId) {
- - int offset = readingOffset(wordId);
- - int readingData = buffer.get(offset++) & 0xff;
- - final int readingLength;
- - if ((readingData & 1) == 0) {
- - readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
- + if (hasReadingData(wordId)) {
- + int offset = readingOffset(wordId);
- + int readingData = buffer.get(offset++) & 0xff;
- + final int readingLength;
- + if ((readingData & 1) == 0) {
- + readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
- + } else {
- + readingLength = readingData >>> 1;
- + }
- + return offset + readingLength;
- } else {
- - readingLength = readingData >>> 1;
- + return readingOffset(wordId);
- }
- - return offset + readingLength;
- }
- private boolean hasBaseFormData(int wordId) {
- return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
- }
- + private boolean hasReadingData(int wordId) {
- + return (buffer.getShort(wordId) & HAS_READING) != 0;
- + }
- +
- private boolean hasPronunciationData(int wordId) {
- return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
- }
- @@ -256,6 +285,8 @@
- /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
- public static final int HAS_BASEFORM = 1;
- + /** flag that the entry has reading data. otherwise the reading is the surface form with hiragana converted to katakana */
- + public static final int HAS_READING = 2;
- /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
- - public static final int HAS_PRONUNCIATION = 2;
- + public static final int HAS_PRONUNCIATION = 4;
- }
- Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
- ===================================================================
- --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (working copy)
- @@ -51,7 +51,7 @@
- }
- @Override
- - public String getReading(int wordId) {
- + public String getReading(int wordId, char surface[], int off, int len) {
- return null;
- }
- Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
- ===================================================================
- --- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (revision 1232218)
- +++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java (working copy)
- @@ -75,14 +75,14 @@
- * @return reading. null if token doesn't have reading.
- */
- public String getReading() {
- - return dictionary.getReading(wordId);
- + return dictionary.getReading(wordId, surfaceForm, offset, length);
- }
- /**
- * @return pronunciation. null if token doesn't have pronunciation.
- */
- public String getPronunciation() {
- - return dictionary.getPronunciation(wordId);
- + return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
- }
- /**
- @@ -110,7 +110,7 @@
- * @return base form or null if token is not inflected
- */
- public String getBaseForm() {
- - return dictionary.getBaseForm(wordId);
- + return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
- }
- /**
- Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
- ===================================================================
- Cannot display: file marked as a binary type.
- svn:mime-type = application/octet-stream
- Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
- ===================================================================
- Cannot display: file marked as a binary type.
- svn:mime-type = application/octet-stream
- Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
- ===================================================================
- Cannot display: file marked as a binary type.
- svn:mime-type = application/octet-stream
Add Comment
Please, Sign In to add comment