Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # ----------------------------------------------------------------------
- # phplex.py
- #
- # A lexer for PHP.
- # ----------------------------------------------------------------------
- import ply.lex as lex
- # todo: literal html
- # todo: double-quoted strings
- # todo: number literals (LNUMBER, DNUMBER)
- # todo: heredocs
- # todo: backticks
- # todo: namespaces
- # todo: casts
- # todo: "die" as alias for "exit"
- # todo: BAD_CHARACTER
- # todo: CURLY_OPEN, DOLLAR_OPEN_CURLY_BRACES, STRING_VARNAME
- # todo: <script> syntax (does anyone use this?)
- # todo: HALT_COMPILER (??)
# Reserved words
# Each entry is both the token type and (case-insensitively) the PHP keyword.
reserved = (
    'ARRAY', 'AS', 'BREAK', 'CASE', 'CLASS', 'CONST', 'CONTINUE', 'DECLARE',
    'DEFAULT', 'DO', 'ECHO', 'ELSE', 'ELSEIF', 'EMPTY', 'ENDDECLARE',
    'ENDFOR', 'ENDFOREACH', 'ENDIF', 'ENDSWITCH', 'ENDWHILE', 'EVAL', 'EXIT',
    'EXTENDS', 'FOR', 'FOREACH', 'FUNCTION', 'GLOBAL', 'IF', 'INCLUDE',
    'INCLUDE_ONCE', 'INSTANCEOF', 'ISSET', 'LIST', 'NEW', 'PRINT', 'REQUIRE',
    'REQUIRE_ONCE', 'RETURN', 'STATIC', 'SWITCH', 'UNSET', 'USE', 'VAR',
    'WHILE', 'FINAL', 'INTERFACE', 'IMPLEMENTS', 'PUBLIC', 'PRIVATE',
    'PROTECTED', 'ABSTRACT', 'CLONE', 'TRY', 'CATCH', 'THROW', 'CFUNCTION',
    'OLD_FUNCTION',
)

# Complete list of token types: the reserved words followed by every other
# type produced by the t_* rules.
tokens = reserved + (
    # Generic
    'WHITESPACE', 'OP',
    # Operators
    'SL', 'SR', 'BOOLEAN_OR', 'BOOLEAN_AND', 'IS_SMALLER_OR_EQUAL',
    'IS_GREATER_OR_EQUAL', 'IS_EQUAL', 'IS_NOT_EQUAL', 'IS_IDENTICAL',
    'IS_NOT_IDENTICAL',
    # Assignment operators
    'MUL_EQUAL', 'DIV_EQUAL', 'MOD_EQUAL', 'PLUS_EQUAL', 'MINUS_EQUAL',
    'SL_EQUAL', 'SR_EQUAL', 'AND_EQUAL', 'OR_EQUAL', 'XOR_EQUAL',
    'CONCAT_EQUAL',
    # Increment/decrement
    'INC', 'DEC',
    # Arrows
    'OBJECT_OPERATOR', 'DOUBLE_ARROW', 'DOUBLE_COLON',
    # Comments
    'COMMENT', 'DOC_COMMENT',
    # Escaping from HTML
    # The comma after 'CLOSE_TAG' is required: without it Python's implicit
    # string concatenation fuses 'CLOSE_TAG' and 'DIR' into 'CLOSE_TAGDIR',
    # leaving both real token types undeclared.
    'OPEN_TAG', 'OPEN_TAG_WITH_ECHO', 'CLOSE_TAG',
    # Identifiers and reserved words
    'DIR', 'FILE', 'LINE', 'FUNC_C', 'CLASS_C', 'METHOD_C', 'NS_C',
    'LOGICAL_AND', 'LOGICAL_OR', 'LOGICAL_XOR',
    'STRING', 'VARIABLE',
    'LNUMBER', 'DNUMBER',
    'CONSTANT_ENCAPSED_STRING',
)
# Newlines
# Whitespace is returned as a token; the lexer's line counter is advanced by
# the number of newlines contained in the matched run.
def t_WHITESPACE(t):
    r'[ \t\r\n]+'
    newline_count = t.value.count("\n")
    t.lexer.lineno += newline_count
    return t
# Assignment operators
# Each rule's docstring is its regular expression; the token is returned
# unchanged. The definition order of these rules is kept as it was.
def t_SL_EQUAL(t):
    r'<<='
    return t


def t_SR_EQUAL(t):
    r'>>='
    return t


def t_AND_EQUAL(t):
    r'&='
    return t


def t_OR_EQUAL(t):
    r'\|='
    return t


def t_XOR_EQUAL(t):
    r'\^='
    return t


def t_MUL_EQUAL(t):
    r'\*='
    return t


def t_DIV_EQUAL(t):
    r'/='
    return t


def t_MOD_EQUAL(t):
    r'%='
    return t


def t_PLUS_EQUAL(t):
    r'\+='
    return t


def t_MINUS_EQUAL(t):
    r'-='
    return t


def t_CONCAT_EQUAL(t):
    r'\.='
    return t
# Operators
# Each rule's docstring is its regular expression; the token is returned
# unchanged. The definition order of these rules is kept as it was
# (e.g. '===' is defined ahead of '==').
def t_SL(t):
    r'<<'
    return t


def t_SR(t):
    r'>>'
    return t


def t_BOOLEAN_AND(t):
    r'&&'
    return t


def t_BOOLEAN_OR(t):
    r'\|\|'
    return t


def t_IS_SMALLER_OR_EQUAL(t):
    r'<='
    return t


def t_IS_GREATER_OR_EQUAL(t):
    r'>='
    return t


def t_IS_IDENTICAL(t):
    r'==='
    return t


def t_IS_NOT_IDENTICAL(t):
    r'!=='
    return t


def t_IS_EQUAL(t):
    r'=='
    return t


def t_IS_NOT_EQUAL(t):
    r'(!=)|(<>)'
    return t
# Increment/decrement
# The docstring of each rule is its regular expression.
def t_INC(t):
    r'\+\+'
    return t


def t_DEC(t):
    r'--'
    return t
# Arrows
# The docstring of each rule is its regular expression.
def t_OBJECT_OPERATOR(t):
    r'->'
    return t


def t_DOUBLE_ARROW(t):
    r'=>'
    return t


def t_DOUBLE_COLON(t):
    r'::'
    return t
# Comments
# A block comment opened with '/**'. The match may span several lines, so the
# lexer's line counter is advanced by the newlines it contains.
def t_DOC_COMMENT(t):
    r'/\*\*(.|\n)*?\*/'
    lines_spanned = t.value.count("\n")
    t.lexer.lineno += lines_spanned
    return t
# Ordinary comments: '/* ... */' blocks and single-line '//' or '#' comments.
# A single-line comment includes its terminating newline when there is one.
# The newline is optional so that a comment on the last line of the input
# (no trailing newline) is still recognised as a comment; previously such a
# comment fell through to the other rules.
# The lexer's line counter is advanced by the newlines inside the match.
def t_COMMENT(t):
    r'(/\*(.|\n)*?\*/)|(//[^\n]*\n?)|(\#[^\n]*\n?)'
    t.lexer.lineno += t.value.count("\n")
    return t
# Escaping from HTML
# Opening tag: '<?' or '<%', optionally followed by 'php' or '=', and then an
# optional newline that becomes part of the token. Tags carrying '=' are
# retyped to OPEN_TAG_WITH_ECHO.
def t_OPEN_TAG(t):
    r'<[?%]((php)|=)?\n?'
    # The pattern can only produce '=' through the echo marker, so test for
    # containment: endswith('=') missed the case where the optional trailing
    # newline was consumed (e.g. "<?=\n").
    if '=' in t.value:
        t.type = 'OPEN_TAG_WITH_ECHO'
    t.lexer.lineno += t.value.count("\n")
    return t
# Closing tag: '?>' or '%>' plus an optional newline that becomes part of the
# token; the lexer's line counter is advanced when that newline is present.
def t_CLOSE_TAG(t):
    r'[?%]>\n?'
    swallowed_newlines = t.value.count("\n")
    t.lexer.lineno += swallowed_newlines
    return t
# Identifiers and reserved words
# Lookup table from an upper-cased identifier to its token type.
reserved_map = {
    # Magic constants
    '__DIR__': 'DIR',
    '__FILE__': 'FILE',
    '__LINE__': 'LINE',
    '__FUNCTION__': 'FUNC_C',
    '__CLASS__': 'CLASS_C',
    '__METHOD__': 'METHOD_C',
    '__NAMESPACE__': 'NS_C',
    # Word-form logical operators
    'AND': 'LOGICAL_AND',
    'OR': 'LOGICAL_OR',
    'XOR': 'LOGICAL_XOR',
}

# Every entry of `reserved` maps to the token type of the same name.
for r in reserved:
    reserved_map[r] = r
# Identifier
# The upper-cased text is looked up in reserved_map; identifiers that are not
# found there keep the generic STRING type.
def t_STRING(t):
    r'[A-Za-z_][\w_]*'
    lookup_key = t.value.upper()
    t.type = reserved_map.get(lookup_key, 'STRING')
    return t
# Variable
# A '$' followed by an identifier; the token is returned as matched.
def t_VARIABLE(t):
    r'\$[A-Za-z_][\w_]*'
    return t
# Floating literal (todo)
# Defined ahead of t_LNUMBER on purpose: PLY tries function rules in the order
# they are defined, and the integer pattern would otherwise claim the leading
# digits of every float ("1.5" came out as "1", ".", "5"), making this rule
# unreachable.
# The spaces around '|' in the pattern rely on the pattern being compiled in
# verbose mode (PLY's default re flags).
def t_DNUMBER(t):
    r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
    return t


# Integer literal (todo)
def t_LNUMBER(t):
    r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
    return t
# String literal
# A single- or double-quoted string on one line; a backslash escapes the
# character that follows it. The token is returned as matched, quotes included.
def t_CONSTANT_ENCAPSED_STRING(t):
    r'(\"([^\\\n]|(\\.))*?\")|(\'([^\\\n]|(\\.))*?\')'
    return t
# Simple operator
# Any single punctuation character from the class below is emitted as a
# generic OP token. Note that '+-/' inside the class is a character range
# (it spans '+', ',', '-', '.', '/').
def t_OP(t):
    r'[\(\)\{\}\[\]+-/*%^&|~=<>.!,?:;@]'
    generic_type = 'OP'
    t.type = generic_type
    return t
# Error rule: report the first character of the unmatched input and step the
# lexer past it.
def t_error(t):
    offending_char = t.value[0]
    print("Illegal character %s" % repr(offending_char))
    t.lexer.skip(1)
# Build the module-level lexer from the rules defined in this file
# (PLY optimized mode is requested via optimize=1).
lexer = lex.lex(optimize=1)

# When executed as a script, hand the lexer to PLY's runmain helper.
if __name__ == "__main__":
    lex.runmain(lexer)
Add Comment
Please, Sign In to add comment