Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- package org.apache.lucene.analysis.standard;
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /*
- WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
- the tokenizer, only use Java 1.4 !!!
- This grammar currently uses constructs (eg :digit:, :letter:) whose
- meaning can vary according to the JRE used to run jflex. See
- https://issues.apache.org/jira/browse/LUCENE-1126 for details.
- For current backwards compatibility it is needed to support
- only Java 1.4 - this will change in Lucene 3.1.
- */
- import org.apache.lucene.analysis.Token;
- import org.apache.lucene.analysis.tokenattributes.TermAttribute;
- %%
- %class MyTokenizerImpl
- %unicode
- %integer
- %function getNextToken
- %pack
- %char
- %{
- public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
- public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
- public static final int ACRONYM = StandardTokenizer.ACRONYM;
- public static final int COMPANY = StandardTokenizer.COMPANY;
- public static final int EMAIL = StandardTokenizer.EMAIL;
- public static final int HOST = StandardTokenizer.HOST;
- public static final int NUM = StandardTokenizer.NUM;
- public static final int CJ = StandardTokenizer.CJ;
- /**
- * @deprecated this solves a bug where HOSTs that end with '.' are identified
- * as ACRONYMs.
- */
- @Deprecated
- public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
- public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
- public final int yychar()
- {
- return yychar;
- }
- /**
- * Fills Lucene token with the current token text.
- */
- final void getText(Token t) {
- t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
- }
- /**
- * Fills TermAttribute with the current token text.
- */
- final void getText(TermAttribute t) {
- t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
- }
- %}
- THAI = [\u0E00-\u0E59]
- // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
- ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
- // internal apostrophes: O'Reilly, you're, O'Reilly's
- // use a post-filter to remove possessives
- APOSTROPHE = {ALPHA} ("'" {ALPHA})+
- // acronyms: U.S.A., I.B.M., etc.
- // use a post-filter to remove dots
- ACRONYM = {LETTER} "." ({LETTER} ".")+
- ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
- // company names like AT&T and Excite@Home.
- COMPANY = {ALPHA} ("&"|"@") {ALPHA}
- // email addresses
- EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
- // hostname
- HOST = {ALPHANUM} ((".") {ALPHANUM})+
- // floating point, serial, model numbers, ip addresses, etc.
- // every other segment must have at least one digit
- NUM = (
- {ALPHANUM} {P} {ALPHANUM}
- | {ALPHANUM} ({P} {ALPHANUM})+
- | {HAS_DIGIT} ({P} {ALPHANUM})+
- | {ALPHANUM} {P} {HAS_DIGIT}
- | {HAS_DIGIT} {P} {ALPHANUM}
- | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
- | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
- | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
- | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
- // punctuation
- P = ("_"|"-"|"/"|"."|",")
- // at least one digit
- HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
- ALPHA = ({LETTER})+
- // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
- LETTER = !(![:letter:]|{CJ})
- // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
- CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
- WHITESPACE = \r\n | [ \r\n\t\f]
- %%
- {ALPHANUM} { return ALPHANUM; }
- {APOSTROPHE} { return APOSTROPHE; }
- {ACRONYM} { return ACRONYM; }
- {COMPANY} { return COMPANY; }
- {EMAIL} { return EMAIL; }
- {HOST} { return HOST; }
- {NUM} { return NUM; }
- {CJ} { return CJ; }
- {ACRONYM_DEP} { return ACRONYM_DEP; }
- /** Ignore the rest */
- . | {WHITESPACE} { /* ignore */ }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement