Untitled

Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/TestTokenInfoDictionary.java	(working copy)
@@ -61,7 +61,7 @@
         assertTrue(wordId > lastWordId);
         lastWordId = wordId;

-        String baseForm = tid.getBaseForm(wordId);
+        String baseForm = tid.getBaseForm(wordId, chars, 0, chars.length);
         assertTrue(baseForm == null || UnicodeUtil.validUTF16String(baseForm));

         String inflectionForm = tid.getInflectionForm(wordId);
@@ -91,11 +91,11 @@
         // check that its actually an ipadic pos tag
         assertNotNull(ToStringUtil.getPOSTranslation(pos));

-        String pronunciation = tid.getPronunciation(wordId);
+        String pronunciation = tid.getPronunciation(wordId, chars, 0, chars.length);
         assertNotNull(pronunciation);
         assertTrue(UnicodeUtil.validUTF16String(pronunciation));

-        String reading = tid.getReading(wordId);
+        String reading = tid.getReading(wordId, chars, 0, chars.length);
         assertNotNull(reading);
         assertTrue(UnicodeUtil.validUTF16String(reading));
       }
Index: modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java
===================================================================
--- modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/test/org/apache/lucene/analysis/kuromoji/dict/UserDictionaryTest.java	(working copy)
@@ -73,12 +73,12 @@
     int[][] result = dictionary.lookup("日本経済新聞".toCharArray(), 0, 6);
     assertEquals(3, result.length);
     int wordIdNihon = result[0][0]; // wordId of 日本 in 日本経済新聞
-    assertEquals("ニホン", dictionary.getReading(wordIdNihon));
+    assertEquals("ニホン", dictionary.getReading(wordIdNihon, "日本".toCharArray(), 0, 2));

     result = dictionary.lookup("朝青龍".toCharArray(), 0, 3);
     assertEquals(1, result.length);
     int wordIdAsashoryu = result[0][0]; // wordId for 朝青龍
-    assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu));
+    assertEquals("アサショウリュウ", dictionary.getReading(wordIdAsashoryu, "朝青龍".toCharArray(), 0, 3));
   }

   @Test
Index: modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java
===================================================================
--- modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/kuromoji/util/BinaryDictionaryWriter.java	(working copy)
@@ -103,12 +103,15 @@
     if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
       flags |= BinaryDictionary.HAS_BASEFORM;
     }
+    if (!reading.equals(toKatakana(entry[0]))) {
+      flags |= BinaryDictionary.HAS_READING;
+    }
     if (!pronunciation.equals(reading)) {
       flags |= BinaryDictionary.HAS_PRONUNCIATION;
     }

     assert leftId == rightId;
-    assert leftId < 8192; // there are still unused bits
+    assert leftId < 4096; // there are still unused bits
     // add pos mapping
     int toFill = 1+leftId - posDict.size();
     for (int i = 0; i < toFill; i++) {
@@ -119,27 +122,36 @@
     assert existing == null || existing.equals(fullPOSData);
     posDict.set(leftId, fullPOSData);

-    buffer.putShort((short)(leftId << 2 | flags));
+    buffer.putShort((short)(leftId << 3 | flags));
     buffer.putShort(wordCost);

     if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
-      buffer.put((byte) baseForm.length());
-      for (int i = 0; i < baseForm.length(); i++) {
+      assert baseForm.length() < 16;
+      int shared = sharedPrefix(entry[0], baseForm);
+      int suffix = baseForm.length() - shared;
+      buffer.put((byte) (shared << 4 | suffix));
+      for (int i = shared; i < baseForm.length(); i++) {
         buffer.putChar(baseForm.charAt(i));
       }
     }

-    if (isKatakana(reading)) {
-      buffer.put((byte) (reading.length() << 1 | 1));
-      writeKatakana(reading);
-    } else {
-      buffer.put((byte) (reading.length() << 1));
-      for (int i = 0; i < reading.length(); i++) {
-        buffer.putChar(reading.charAt(i));
+    if ((flags & BinaryDictionary.HAS_READING) != 0) {
+      if (isKatakana(reading)) {
+        buffer.put((byte) (reading.length() << 1 | 1));
+        writeKatakana(reading);
+      } else {
+        buffer.put((byte) (reading.length() << 1));
+        for (int i = 0; i < reading.length(); i++) {
+          buffer.putChar(reading.charAt(i));
+        }
       }
     }

     if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) {
+      // we can save 150KB here, but it makes the reader a little complicated.
+      // int shared = sharedPrefix(reading, pronunciation);
+      // buffer.put((byte) shared);
+      // pronunciation = pronunciation.substring(shared);
       if (isKatakana(pronunciation)) {
         buffer.put((byte) (pronunciation.length() << 1 | 1));
         writeKatakana(pronunciation);
@@ -170,6 +182,27 @@
     }
   }

+  /** Shifts each char in U+3041..U+3096 up by 0x60 (hiragana to katakana); others are kept. */
+  private String toKatakana(String s) {
+    final char[] kana = s.toCharArray();
+    for (int pos = 0; pos < kana.length; pos++) {
+      final char c = kana[pos];
+      // exclusive bounds: exactly the code units U+3041..U+3096 are shifted
+      if (c > 0x3040 && c < 0x3097) {
+        kana[pos] = (char) (c + 0x60);
+      }
+    }
+    return new String(kana);
+  }
+
+  /** Returns how many leading chars {@code left} and {@code right} have in common. */
+  public static int sharedPrefix(String left, String right) {
+    final int limit = Math.min(left.length(), right.length());
+    int common = 0;
+    while (common < limit && left.charAt(common) == right.charAt(common)) { common++; }
+    return common;
+  }
+
   public void addMapping(int sourceId, int wordId) {
     assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;

Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UserDictionary.java	(working copy)
@@ -196,7 +196,7 @@
   }

   @Override
-  public String getReading(int wordId) {
+  public String getReading(int wordId, char surface[], int off, int len) {
     return getFeature(wordId, 0);
   }

@@ -206,12 +206,12 @@
   }

   @Override
-  public String getBaseForm(int wordId) {
+  public String getBaseForm(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }

   @Override
-  public String getPronunciation(int wordId) {
+  public String getPronunciation(int wordId, char surface[], int off, int len) {
     return null; // TODO: add support?
   }

Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/Dictionary.java	(working copy)
@@ -54,21 +54,30 @@
    * @param wordId word ID of token
+   * @param surface character buffer holding the token's surface form
+   * @param off index in {@code surface} of the first char of the surface form
+   * @param len number of chars of the surface form
    * @return Reading of the token
    */
-  public String getReading(int wordId);
+  public String getReading(int wordId, char surface[], int off, int len);
 
   /**
    * Get base form of word
    * @param wordId word ID of token
+   * @param surface character buffer holding the token's surface form
+   * @param off index in {@code surface} of the first char of the surface form
+   * @param len number of chars of the surface form
    * @return Base form (only different for inflected words, otherwise null)
    */
-  public String getBaseForm(int wordId);
+  public String getBaseForm(int wordId, char surface[], int off, int len);
 
   /**
    * Get pronunciation of tokens
    * @param wordId word ID of token
+   * @param surface character buffer holding the token's surface form
+   * @param off index in {@code surface} of the first char of the surface form
+   * @param len number of chars of the surface form
    * @return Pronunciation of the token
    */
-  public String getPronunciation(int wordId);
+  public String getPronunciation(int wordId, char surface[], int off, int len);
 
   /**
    * Get inflection type of tokens
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/BinaryDictionary.java	(working copy)
@@ -149,12 +149,12 @@

   @Override
   public int getLeftId(int wordId) {
-    return buffer.getShort(wordId) >>> 2;
+    return buffer.getShort(wordId) >>> 3;
   }

   @Override
   public int getRightId(int wordId) {
-    return buffer.getShort(wordId) >>> 2;
+    return buffer.getShort(wordId) >>> 3;
   }

   @Override
@@ -163,21 +163,42 @@
   }

   @Override
-  public String getBaseForm(int wordId) {
+  public String getBaseForm(int wordId, char surfaceForm[], int off, int len) {
     if (hasBaseFormData(wordId)) {
       int offset = baseFormOffset(wordId);
-      int length = buffer.get(offset++) & 0xff;
-      return readString(offset, length, false);
+      int data = buffer.get(offset++) & 0xff;
+      int prefix = data >>> 4;
+      int suffix = data & 0xF;
+      char text[] = new char[prefix+suffix];
+      System.arraycopy(surfaceForm, off, text, 0, prefix);
+      for (int i = 0; i < suffix; i++) {
+        text[prefix+i] = buffer.getChar(offset + (i << 1));
+      }
+      return new String(text);
     } else {
       return null;
     }
   }

   @Override
-  public String getReading(int wordId) {
-    int offset = readingOffset(wordId);
-    int readingData = buffer.get(offset++) & 0xff;
-    return readString(offset, readingData >>> 1, (readingData & 1) == 1);
+  public String getReading(int wordId, char surface[], int off, int len) {
+    if (!hasReadingData(wordId)) {
+      // No reading stored for this entry: rebuild it from the surface form,
+      // shifting chars in U+3041..U+3096 (hiragana) up by 0x60 to katakana.
+      final char[] kana = new char[len];
+      for (int i = 0; i < len; i++) {
+        final char c = surface[off + i];
+        kana[i] = (c > 0x3040 && c < 0x3097) ? (char) (c + 0x60) : c;
+      }
+      return new String(kana);
+    }
+    // Stored reading: one header byte, then the encoded characters.
+    int pos = readingOffset(wordId);
+    final int header = buffer.get(pos++) & 0xff;
+    // header = (length << 1) | flag; the flag is passed through to readString
+    final int length = header >>> 1;
+    final boolean flag = (header & 1) == 1;
+    return readString(pos, length, flag);
   }

   @Override
@@ -186,13 +207,13 @@
   }

   @Override
-  public String getPronunciation(int wordId) {
+  public String getPronunciation(int wordId, char surface[], int off, int len) {
     if (hasPronunciationData(wordId)) {
       int offset = pronunciationOffset(wordId);
       int pronunciationData = buffer.get(offset++) & 0xff;
       return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1);
     } else {
-      return getReading(wordId); // same as the reading
+      return getReading(wordId, surface, off, len); // same as the reading
     }
   }

@@ -213,7 +234,7 @@
   private int readingOffset(int wordId) {
     int offset = baseFormOffset(wordId);
     if (hasBaseFormData(wordId)) {
-      int baseFormLength = buffer.get(offset++) & 0xff;
+      int baseFormLength = buffer.get(offset++) & 0xf;
       return offset + (baseFormLength << 1);
     } else {
       return offset;
@@ -221,21 +242,29 @@
   }

   private int pronunciationOffset(int wordId) {
-    int offset = readingOffset(wordId);
-    int readingData = buffer.get(offset++) & 0xff;
-    final int readingLength;
-    if ((readingData & 1) == 0) {
-      readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+    if (hasReadingData(wordId)) {
+      int offset = readingOffset(wordId);
+      int readingData = buffer.get(offset++) & 0xff;
+      final int readingLength;
+      if ((readingData & 1) == 0) {
+        readingLength = readingData & 0xfe; // UTF-16: mask off kana bit
+      } else {
+        readingLength = readingData >>> 1;
+      }
+      return offset + readingLength;
     } else {
-      readingLength = readingData >>> 1;
+      return readingOffset(wordId);
     }
-    return offset + readingLength;
   }

   private boolean hasBaseFormData(int wordId) {
     return (buffer.getShort(wordId) & HAS_BASEFORM) != 0;
   }

+  private boolean hasReadingData(int wordId) {
+    return (buffer.getShort(wordId) & HAS_READING) != 0;
+  }
+
   private boolean hasPronunciationData(int wordId) {
     return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0;
   }
@@ -256,6 +285,8 @@

   /** flag that the entry has baseform data. otherwise its not inflected (same as surface form) */
   public static final int HAS_BASEFORM = 1;
+  /** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
+  public static final int HAS_READING = 2;
   /** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
-  public static final int HAS_PRONUNCIATION = 2;
+  public static final int HAS_PRONUNCIATION = 4;
 }
Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java	(working copy)
@@ -51,7 +51,7 @@
   }

   @Override
-  public String getReading(int wordId) {
+  public String getReading(int wordId, char surface[], int off, int len) {
     return null;
   }

Index: modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java
===================================================================
--- modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(revision 1232218)
+++ modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/Token.java	(working copy)
@@ -75,14 +75,14 @@
    * @return reading. null if token doesn't have reading.
    */
   public String getReading() {
-    return dictionary.getReading(wordId);
+    return dictionary.getReading(wordId, surfaceForm, offset, length);
   }

   /**
    * @return pronunciation. null if token doesn't have pronunciation.
    */
   public String getPronunciation() {
-    return dictionary.getPronunciation(wordId);
+    return dictionary.getPronunciation(wordId, surfaceForm, offset, length);
   }

   /**
@@ -110,7 +110,7 @@
    * @return base form or null if token is not inflected
    */
   public String getBaseForm() {
-    return dictionary.getBaseForm(wordId);
+    return dictionary.getBaseForm(wordId, surfaceForm, offset, length);
   }

   /**
Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Index: modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream