Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // Lexer that picks user mentions, hashtags, money amounts, via/rt/cc markers
- // and keyword statements out of short free-form text.
- lexer grammar CheepLexer;
- options {
-     // The embedded actions (@header, @init, @members and the rule actions) are
-     // Ruby, so the Ruby target has to be selected explicitly; ANTLR 3 generates
-     // Java when no language option is given.
-     language = Ruby;
-     // Filter mode. NOTE(review): @members overrides next_token, so confirm the
-     // override cooperates with the filter-mode support of the Ruby runtime.
-     filter = true;
- }
- // Token types that have no rule of their own. Rule actions assign them to
- // @state.type by hand before calling emit.
- tokens {
-     CURRENCY_SYMBOL;    // currency marker matched at the start of MONEY
-     VALUE;              // numeric amount matched at the end of MONEY
-     AT_SYMBOL;          // the leading at-sign of USER
-     HASH_SYMBOL;        // the leading hash sign of HASH_TAG
-     STATEMENT;          // a TEXT word found in the statements table
- }
- @header {
- require 'yaml'
- }
- @init {
-     # Keyword table consulted by the TEXT rule. load_file opens, parses and
-     # closes the file itself, so no File handle is left open.
-     @statements = YAML.load_file( 'statements.yml' )
-     # Tokens emitted by rule actions wait here until next_token hands them out.
-     @token_buffer = Array.new
-     # Set to true by TEXT once the first statement keyword has been matched.
-     @got_statement = false
-     # Key and language of the matched statement keyword, filled in by TEXT.
-     @statement_type = nil
-     @cheep_lang = :en
- }
- @members {
-     # Language of the statement keyword matched by TEXT (:en until one is seen).
-     attr_reader :cheep_lang
-     # Key of the statement keyword matched by TEXT (nil until one is seen).
-     attr_reader :statement_type
-     # Queues a token for next_token to hand out.
-     #
-     # With no argument the token in @state.token is used, and create_token
-     # builds one when that is nil. The token is stored back in @state.token,
-     # appended to @token_buffer and returned.
-     def emit( token = @state.token )
-         token ||= create_token
-         @state.token = token
-         @token_buffer.push token
-         return token
-     end
-     # Marks the current input position as the start of the next token and puts
-     # the channel back to DEFAULT_CHANNEL. Rule actions call this right after
-     # emit so that the following token begins where the previous one ended.
-     def advance_input
-         @state.token_start_position = @input.index
-         @state.token_start_column = @input.column
-         @state.token_start_line = @input.line
-         @state.channel = DEFAULT_CHANNEL
-     end
-     # Returns the next token.
-     #
-     # Queued tokens are handed out first. When the queue is empty the lexer
-     # state is reset and token! is run once; a rule may queue several tokens
-     # through its actions, and a rule that queued nothing gets one token
-     # emitted on its behalf. EOF_TOKEN is returned when the queue is empty and
-     # the input is exhausted. Recognition errors are reported and lexing
-     # continues.
-     #
-     # NOTE(review): this replaces whatever next_token the runtime supplies for
-     # filter-mode grammars - confirm that is intended.
-     def next_token
-         loop do
-             if @token_buffer.empty?
-                 @state.token = nil
-                 advance_input
-                 @state.text = nil
-                 @input.peek == EOF and return EOF_TOKEN
-                 begin
-                     token!
-                     case @state.token
-                     when nil then emit
-                     when SKIP_TOKEN then next
-                     end
-                 rescue NoViableAlternative => re
-                     report_error( re )
-                     recover( re )
-                 rescue Error::RecognitionError => re
-                     report_error( re )
-                 end
-             else
-                 tk = @token_buffer.shift
-                 # Hand out real tokens only; a nil or SKIP_TOKEN entry is
-                 # dropped and the loop moves on to the next queued token.
-                 return tk unless tk.nil? || tk == SKIP_TOKEN
-             end
-         end
-     end
- }
- // A single whitespace character (or end of input), routed to the hidden channel.
- WS
-     @init { $channel = ANTLR3::HIDDEN_CHANNEL }
-     :   ( ' ' | '\t' | '\r' | '\n' | '\u000C' | EOF )
-     ;
- // The word via followed by a user mention. The keyword is emitted as its own
- // TWEET_VIA_STATEMENT token; SPACES and USER then emit their own tokens.
- TWEET_VIA_STATEMENT
-     :   'via'
-         {
-             @state.type = TWEET_VIA_STATEMENT
-             @state.token = nil
-             emit
-             advance_input
-         }
-         SPACES? USER
-     ;
- // The word rt followed by a user mention. The keyword is emitted as its own
- // TWEET_RT_STATEMENT token; SPACES and USER then emit their own tokens.
- TWEET_RT_STATEMENT
-     :   'rt'
-         {
-             @state.type = TWEET_RT_STATEMENT
-             @state.token = nil
-             emit
-             advance_input
-         }
-         SPACES? USER
-     ;
- // A slash, optional spaces and the word cc, followed by a user mention. The
- // marker is emitted as a TWEET_CC_STATEMENT token before SPACES and USER run.
- TWEET_CC_STATEMENT
-     :   '/' SPACES? 'cc'
-         {
-             @state.type = TWEET_CC_STATEMENT
-             @state.token = nil
-             emit
-             advance_input
-         }
-         SPACES? USER
-     ;
- // A currency marker (a dollar sign with an optional us, u or r prefix, or the
- // euro sign \u20ac) followed by a NUMBER. Two tokens are emitted: one
- // CURRENCY_SYMBOL for the marker and one VALUE for the amount.
- MONEY
-     :   ( ( 'us' | 'u' | 'r' )? '$' | '\u20ac' )
-         {
-             @state.type = CURRENCY_SYMBOL
-             @state.token = nil
-             emit
-             advance_input
-         }
-         SPACES?
-         NUMBER
-         {
-             @state.type = VALUE
-             @state.token = nil
-             emit
-             advance_input
-         }
-     ;
- // An optionally signed run of digits with dot- or comma-separated digit groups.
- // It must be followed by WS; the action strips that whitespace from the text.
- NUMBER
-     :   ( ( '+' | '-' ) SPACES? )?
-         ( '0'..'9' )+
-         ( ( '.' | ',' ) ( '0'..'9' )+ )*
-         WS
-         { @state.text = self.text.strip }
-     ;
- // A user mention. The at-sign is emitted as an AT_SYMBOL token and the name
- // (ASCII letters, digits and underscores) as a separate USER token.
- USER
-     :   '@'
-         {
-             @state.type = AT_SYMBOL
-             @state.token = nil
-             emit
-             advance_input
-         }
-         ( 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' )+
-         {
-             @state.type = USER
-             @state.token = nil
-             emit
-             advance_input
-         }
-     ;
- // A hashtag. The hash sign is emitted as a HASH_SYMBOL token and the run of
- // non-whitespace characters after it as a HASH_TAG token. A whitespace
- // character directly after the tag is emitted as a hidden WS token; it is
- // optional so that a hashtag at the very end of the input still matches.
- HASH_TAG
-     :   '#'
-         {@state.type = HASH_SYMBOL; @state.token = nil; emit; advance_input;}
-         ~(' '|'\r'|'\t'|'\u000C'|'\n')+
-         {@state.type = HASH_TAG; @state.token = nil; emit; advance_input;}
-         (
-             (' '|'\r'|'\t'|'\u000C'|'\n')
-             {@state.type = WS; @state.channel=ANTLR3::HIDDEN_CHANNEL; @state.token = nil; emit; advance_input;}
-         )?
-     ;
- // A single punctuation or operator character.
- SYMBOL
-     :   '{' | '}' | '[' | ']' | '(' | ')'
-     |   '<' | '>' | '=' | '.' | ',' | ';' | ':'
-     |   '*' | '&' | '%' | '^' | '~'
-     ;
- // A single question mark.
- QUESTION
-     :   '?'
-     ;
- // A single exclamation mark.
- EXCLAMATION
-     :   '!'
-     ;
- // A word made of letters and digits. Until a statement has been found, the
- // lower-cased word is looked up in the @statements table (statement key ->
- // language -> keywords). On the first hit the word is emitted as a STATEMENT
- // token, the key and language are recorded, and the action returns from the
- // rule method straight away.
- TEXT
-     :   ( LETTER | DIGIT )+
-         {
-             unless @got_statement
-                 needle = self.text.downcase
-                 @statements.each do |statement_key, by_language|
-                     by_language.each do |language, keywords|
-                         next unless keywords.include?( needle )
-                         @state.type = STATEMENT
-                         @statement_type = statement_key.to_sym
-                         @cheep_lang = language.to_sym
-                         @got_statement = true
-                         emit
-                         advance_input
-                         return
-                     end
-                 end
-             end
-         }
-     ;
- // Catch-all: a run of non-whitespace characters together with the whitespace
- // character (or end of input) that terminates it.
- UNKNOWN
-     :   ~( ' ' | '\t' | '\r' | '\n' | '\u000C' )+
-         ( ' ' | '\t' | '\r' | '\n' | '\u000C' | EOF )
-     ;
- // One or more whitespace characters inside a larger rule. The run is emitted
- // as a WS token on the hidden channel before the enclosing rule continues.
- fragment
- SPACES
-     :   ( ' ' | '\t' | '\r' | '\n' | '\u000C' )+
-         {
-             @state.type = WS
-             @state.channel = ANTLR3::HIDDEN_CHANNEL
-             @state.token = nil
-             emit
-             advance_input
-         }
-     ;
- // A single letter-like character accepted inside TEXT.
- fragment
- LETTER
-     :   '$'                     // \u0024
-     |   'A'..'Z'                // \u0041 - \u005a
-     |   '_'                     // \u005f
-     |   'a'..'z'                // \u0061 - \u007a
-     |   '\u00c0'..'\u00d6'      // Latin-1 letters before the multiplication sign
-     |   '\u00d8'..'\u00f6'      // Latin-1 letters between the two maths signs
-     |   '\u00f8'..'\u00ff'      // Latin-1 letters after the division sign
-     |   '\u0100'..'\u1fff'
-     |   '\u3040'..'\u318f'
-     |   '\u3300'..'\u337f'
-     |   '\u3400'..'\u3d2d'
-     |   '\u4e00'..'\u9fff'      // CJK unified ideographs
-     |   '\uf900'..'\ufaff'      // CJK compatibility ideographs
-     ;
- // A single decimal digit, ASCII or from one of the listed scripts.
- fragment
- DIGIT
-     :   '0'..'9'                // \u0030 - \u0039
-     |   '\u0660'..'\u0669'      // Arabic-Indic
-     |   '\u06f0'..'\u06f9'      // Extended Arabic-Indic
-     |   '\u0966'..'\u096f'      // Devanagari
-     |   '\u09e6'..'\u09ef'      // Bengali
-     |   '\u0a66'..'\u0a6f'      // Gurmukhi
-     |   '\u0ae6'..'\u0aef'      // Gujarati
-     |   '\u0b66'..'\u0b6f'      // Oriya
-     |   '\u0be7'..'\u0bef'      // Tamil (one to nine)
-     |   '\u0c66'..'\u0c6f'      // Telugu
-     |   '\u0ce6'..'\u0cef'      // Kannada
-     |   '\u0d66'..'\u0d6f'      // Malayalam
-     |   '\u0e50'..'\u0e59'      // Thai
-     |   '\u0ed0'..'\u0ed9'      // Lao
-     |   '\u1040'..'\u1049'      // Myanmar
-     ;
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement