Advertisement
Guest User

MyTokenizer.jflex

a guest
May 29th, 2013
110
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 5.43 KB | None
  1. package org.apache.lucene.analysis.standard;
  2.  
  3. /**
  4. * Licensed to the Apache Software Foundation (ASF) under one or more
  5. * contributor license agreements. See the NOTICE file distributed with
  6. * this work for additional information regarding copyright ownership.
  7. * The ASF licenses this file to You under the Apache License, Version 2.0
  8. * (the "License"); you may not use this file except in compliance with
  9. * the License. You may obtain a copy of the License at
  10. *
  11. * http://www.apache.org/licenses/LICENSE-2.0
  12. *
  13. * Unless required by applicable law or agreed to in writing, software
  14. * distributed under the License is distributed on an "AS IS" BASIS,
  15. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16. * See the License for the specific language governing permissions and
  17. * limitations under the License.
  18. */
  19.  
  20. /*
  21.  
  22. WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
  23. the tokenizer, only use Java 1.4 !!!
  24. This grammar currently uses constructs (eg :digit:, :letter:) whose
  25. meaning can vary according to the JRE used to run jflex. See
  26. https://issues.apache.org/jira/browse/LUCENE-1126 for details.
  27. For current backwards compatibility it is needed to support
  28. only Java 1.4 - this will change in Lucene 3.1.
  29.  
  30. */
  31.  
  32. import org.apache.lucene.analysis.Token;
  33. import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  34.  
  35. %%
  36.  
  37. %class MyTokenizerImpl
  38. %unicode
  39. %integer
  40. %function getNextToken
  41. %pack
  42. %char
  43.  
  44. %{
  45.  
  46. public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
  47. public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
  48. public static final int ACRONYM = StandardTokenizer.ACRONYM;
  49. public static final int COMPANY = StandardTokenizer.COMPANY;
  50. public static final int EMAIL = StandardTokenizer.EMAIL;
  51. public static final int HOST = StandardTokenizer.HOST;
  52. public static final int NUM = StandardTokenizer.NUM;
  53. public static final int CJ = StandardTokenizer.CJ;
  54. /**
  55. * @deprecated this solves a bug where HOSTs that end with '.' are identified
  56. * as ACRONYMs.
  57. */
  58. @Deprecated
  59. public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
  60.  
  61. public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
  62.  
  63. public final int yychar()
  64. {
  65. return yychar;
  66. }
  67.  
  68. /**
  69. * Fills Lucene token with the current token text.
  70. */
  71. final void getText(Token t) {
  72. t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  73. }
  74.  
  75. /**
  76. * Fills TermAttribute with the current token text.
  77. */
  78. final void getText(TermAttribute t) {
  79. t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  80. }
  81.  
  82. %}
  83.  
  // Code points U+0E00 through U+0E59.
  THAI = [\u0E00-\u0E59]

  // Basic word: one or more digits, letters or Thai characters.
  // Thai is part of the class so that ThaiAnalyzer can function.
  ALPHANUM = ([:digit:]|{LETTER}|{THAI})+

  // Words with internal apostrophes: O'Reilly, you're, O'Reilly's.
  // Use a post-filter to remove possessives.
  APOSTROPHE = {ALPHA} ( "'" {ALPHA} )+

  // Acronyms such as U.S.A. or I.B.M.: at least two "letter, dot" pairs.
  // Use a post-filter to remove the dots.
  ACRONYM = ({LETTER} ".") ({LETTER} ".")+

  // Same shape as ACRONYM, but each dot-terminated segment is a whole
  // {ALPHANUM} run instead of a single letter.
  ACRONYM_DEP = ({ALPHANUM} ".") ({ALPHANUM} ".")+

  // Company names like AT&T and Excite@Home: two alphabetic runs joined by
  // exactly one '&' or '@'.
  COMPANY = {ALPHA} ("@"|"&") {ALPHA}

  // E-mail addresses. The local part may join segments with '.', '-' or '_';
  // the domain part needs at least one '.' or '-' joined segment after the
  // first one.
  EMAIL = {ALPHANUM} (("_"|"-"|".") {ALPHANUM})* "@" {ALPHANUM} (("-"|".") {ALPHANUM})+

  // Host names: two or more {ALPHANUM} segments separated by dots.
  HOST = {ALPHANUM} ("." {ALPHANUM})+
  107.  
  // Floating point, serial and model numbers, IP addresses, etc.:
  // two or more {ALPHANUM} segments, each pair joined by one {P} character.
  //
  // The earlier definition listed nine alternatives and was commented as
  // requiring a digit in every other segment. That requirement was never in
  // effect: one of the alternatives was already
  //     {ALPHANUM} ({P} {ALPHANUM})+
  // and every string matched by {HAS_DIGIT} is also matched by {ALPHANUM},
  // so that single alternative covered all of the others. The definition
  // below accepts exactly the same inputs without the redundant branches.
  //
  // NOTE(review): if a digit in every other segment is actually wanted,
  // replace this general form with the {HAS_DIGIT}-based alternatives only.
  NUM = {ALPHANUM} ({P} {ALPHANUM})+
  120.  
  // Single punctuation characters: '.', ',', '-', '_' and '/'.
  P = ("."|","|"-"|"_"|"/")

  // A run of letters and digits that contains at least one digit.
  HAS_DIGIT = ([:digit:]|{LETTER})* [:digit:] ([:digit:]|{LETTER})*

  // One or more letters.
  ALPHA = {LETTER}+

  // Any [:letter:] that is not in {CJ}. The double negation is the idiom
  // given in the JFlex manual: "the expression that matches everything of
  // <a> not matched by <b> is !(!<a>|<b>)".
  LETTER = !(![:letter:]|{CJ})

  // Chinese and Japanese (but NOT Korean, which is included in [:letter:]).
  // Ranges in ascending order: Hiragana, Katakana, Bopomofo, Katakana
  // Phonetic Extensions, part of CJK Compatibility, CJK Extension A,
  // CJK Unified Ideographs, CJK Compatibility Ideographs, halfwidth Katakana.
  CJ = [\u3040-\u309F\u30A0-\u30FF\u3100-\u312f\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

  // A single space, tab, form feed, CR or LF, or a CR LF pair.
  WHITESPACE = [ \t\f\r\n] | \r\n
  136.  
  137. %%
  138.  
  // Each rule returns the type code of the macro it matches. JFlex prefers
  // the longest match and, when several rules match the same length, the
  // rule listed first - so the order of these lines is significant.
  {ALPHANUM}      { return ALPHANUM; }
  {APOSTROPHE}    { return APOSTROPHE; }
  {ACRONYM}       { return ACRONYM; }
  {COMPANY}       { return COMPANY; }
  {EMAIL}         { return EMAIL; }
  {HOST}          { return HOST; }
  {NUM}           { return NUM; }
  {CJ}            { return CJ; }
  {ACRONYM_DEP}   { return ACRONYM_DEP; }

  // Everything else - any other single character, or whitespace - is
  // consumed without producing a token.
  . | {WHITESPACE}    { /* ignore */ }
Advertisement
RAW Paste Data Copied
Advertisement