Guest User

Untitled

a guest
May 23rd, 2018
92
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.09 KB | None | 0 0
  1. # ----------------------------------------------------------------------
  2. # phplex.py
  3. #
  4. # A lexer for PHP.
  5. # ----------------------------------------------------------------------
  6.  
  7. import ply.lex as lex
  8.  
  9. # todo: end-of-line comments containing ?>
  10. # todo: double-quoted strings
  11. # todo: number literals (LNUMBER, DNUMBER)
  12. # todo: heredocs
  13. # todo: backticks
  14. # todo: namespaces
  15. # todo: casts
  16. # todo: "die" as alias for "exit"
  17. # todo: BAD_CHARACTER
  18. # todo: CURLY_OPEN, DOLLAR_OPEN_CURLY_BRACES, STRING_VARNAME
  19. # todo: <script> syntax (does anyone use this?)
  20. # todo: HALT_COMPILER (??)
  21.  
  22. states = (
  23. ('php', 'exclusive'),
  24. )
  25.  
  26. # Reserved words
  27. reserved = (
  28. 'ARRAY', 'AS', 'BREAK', 'CASE', 'CLASS', 'CONST', 'CONTINUE', 'DECLARE',
  29. 'DEFAULT', 'DO', 'ECHO', 'ELSE', 'ELSEIF', 'EMPTY', 'ENDDECLARE',
  30. 'ENDFOR', 'ENDFOREACH', 'ENDIF', 'ENDSWITCH', 'ENDWHILE', 'EVAL', 'EXIT',
  31. 'EXTENDS', 'FOR', 'FOREACH', 'FUNCTION', 'GLOBAL', 'IF', 'INCLUDE',
  32. 'INCLUDE_ONCE', 'INSTANCEOF', 'ISSET', 'LIST', 'NEW', 'PRINT', 'REQUIRE',
  33. 'REQUIRE_ONCE', 'RETURN', 'STATIC', 'SWITCH', 'UNSET', 'USE', 'VAR',
  34. 'WHILE', 'FINAL', 'INTERFACE', 'IMPLEMENTS', 'PUBLIC', 'PRIVATE',
  35. 'PROTECTED', 'ABSTRACT', 'CLONE', 'TRY', 'CATCH', 'THROW', 'CFUNCTION',
  36. 'OLD_FUNCTION',
  37. )
  38.  
  39. tokens = reserved + (
  40. 'WHITESPACE', 'INLINE_HTML',
  41.  
  42. # Operators
  43. 'PLUS', 'MINUS', 'MUL', 'DIV', 'MOD', 'AND', 'OR', 'NOT', 'XOR', 'SL',
  44. 'SR', 'BOOLEAN_AND', 'BOOLEAN_OR', 'BOOLEAN_NOT', 'IS_SMALLER',
  45. 'IS_GREATER', 'IS_SMALLER_OR_EQUAL', 'IS_GREATER_OR_EQUAL', 'IS_EQUAL',
  46. 'IS_NOT_EQUAL', 'IS_IDENTICAL', 'IS_NOT_IDENTICAL',
  47.  
  48. # Assignment operators
  49. 'EQUALS', 'MUL_EQUAL', 'DIV_EQUAL', 'MOD_EQUAL', 'PLUS_EQUAL',
  50. 'MINUS_EQUAL', 'SL_EQUAL', 'SR_EQUAL', 'AND_EQUAL', 'OR_EQUAL',
  51. 'XOR_EQUAL', 'CONCAT_EQUAL',
  52.  
  53. # Increment/decrement
  54. 'INC', 'DEC',
  55.  
  56. # Arrows
  57. 'OBJECT_OPERATOR', 'DOUBLE_ARROW', 'DOUBLE_COLON',
  58.  
  59. # Delimiters
  60. 'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET', 'LBRACE', 'RBRACE', 'COMMA',
  61. 'CONCAT', 'QUESTION', 'COLON', 'SEMI', 'AT',
  62.  
  63. # Comments
  64. 'COMMENT', 'DOC_COMMENT',
  65.  
  66. # Escaping from HTML
  67. 'OPEN_TAG', 'OPEN_TAG_WITH_ECHO', 'CLOSE_TAG',
  68.  
  69. # Identifiers and reserved words
  70. 'DIR', 'FILE', 'LINE', 'FUNC_C', 'CLASS_C', 'METHOD_C', 'NS_C',
  71. 'LOGICAL_AND', 'LOGICAL_OR', 'LOGICAL_XOR',
  72. 'STRING', 'VARIABLE',
  73. 'LNUMBER', 'DNUMBER',
  74. 'CONSTANT_ENCAPSED_STRING',
  75. )
  76.  
  77. # Newlines
  78. def t_php_WHITESPACE(t):
  79. r'[ \t\r\n]+'
  80. t.lexer.lineno += t.value.count("\n")
  81. return t
  82.  
  83. # Operators
  84. t_php_PLUS = r'\+'
  85. t_php_MINUS = r'-'
  86. t_php_MUL = r'\*'
  87. t_php_DIV = r'/'
  88. t_php_MOD = r'%'
  89. t_php_AND = r'&'
  90. t_php_OR = r'\|'
  91. t_php_NOT = r'~'
  92. t_php_XOR = r'\^'
  93. t_php_SL = r'<<'
  94. t_php_SR = r'>>'
  95. t_php_BOOLEAN_AND = r'&&'
  96. t_php_BOOLEAN_OR = r'\|\|'
  97. t_php_BOOLEAN_NOT = r'!'
  98. t_php_IS_SMALLER = r'<'
  99. t_php_IS_GREATER = r'>'
  100. t_php_IS_SMALLER_OR_EQUAL = r'<='
  101. t_php_IS_GREATER_OR_EQUAL = r'>='
  102. t_php_IS_EQUAL = r'=='
  103. t_php_IS_NOT_EQUAL = r'(!=)|(<>)'
  104. t_php_IS_IDENTICAL = r'==='
  105. t_php_IS_NOT_IDENTICAL = r'!=='
  106.  
  107. # Assignment operators
  108. t_php_EQUALS = r'='
  109. t_php_MUL_EQUAL = r'\*='
  110. t_php_DIV_EQUAL = r'/='
  111. t_php_MOD_EQUAL = r'%='
  112. t_php_PLUS_EQUAL = r'\+='
  113. t_php_MINUS_EQUAL = r'-='
  114. t_php_SL_EQUAL = r'<<='
  115. t_php_SR_EQUAL = r'>>='
  116. t_php_AND_EQUAL = r'&='
  117. t_php_OR_EQUAL = r'\|='
  118. t_php_XOR_EQUAL = r'\^='
  119. t_php_CONCAT_EQUAL = r'\.='
  120.  
  121. # Increment/decrement
  122. t_php_INC = r'\+\+'
  123. t_php_DEC = r'--'
  124.  
  125. # Arrows
  126. t_php_OBJECT_OPERATOR = r'->'
  127. t_php_DOUBLE_ARROW = r'=>'
  128. t_php_DOUBLE_COLON = r'::'
  129.  
  130. # Delimeters
  131. t_php_LPAREN = r'\('
  132. t_php_RPAREN = r'\)'
  133. t_php_LBRACKET = r'\['
  134. t_php_RBRACKET = r'\]'
  135. t_php_LBRACE = r'\{'
  136. t_php_RBRACE = r'\}'
  137. t_php_COMMA = r','
  138. t_php_CONCAT = r'\.'
  139. t_php_QUESTION = r'\?'
  140. t_php_COLON = r':'
  141. t_php_SEMI = r';'
  142. t_php_AT = r'@'
  143.  
  144. # Comments
  145.  
  146. def t_php_DOC_COMMENT(t):
  147. r'/\*\*(.|\n)*?\*/'
  148. t.lexer.lineno += t.value.count("\n")
  149. return t
  150.  
  151. def t_php_COMMENT(t):
  152. r'(/\*(.|\n)*?\*/)|(//.*?\n)|(\#.*?\n)'
  153. t.lexer.lineno += t.value.count("\n")
  154. return t
  155.  
  156. # Escaping from HTML
  157.  
  158. def t_OPEN_TAG(t):
  159. r'<[?%]((php)|=)?[ \t\r]*\n?'
  160. if t.value.endswith('='): t.type = 'OPEN_TAG_WITH_ECHO'
  161. t.lexer.lineno += t.value.count("\n")
  162. t.lexer.begin('php')
  163. return t
  164.  
  165. def t_php_CLOSE_TAG(t):
  166. r'[?%]>[ \t\r]*\n?'
  167. t.lexer.lineno += t.value.count("\n")
  168. t.lexer.begin('INITIAL')
  169. return t
  170.  
  171. def t_INLINE_HTML(t):
  172. r'(([^<])|(<(?![?%])))+'
  173. t.lexer.lineno += t.value.count("\n")
  174. return t
  175.  
  176. # Identifiers and reserved words
  177.  
  178. reserved_map = {
  179. '__DIR__': 'DIR',
  180. '__FILE__': 'FILE',
  181. '__LINE__': 'LINE',
  182. '__FUNCTION__': 'FUNC_C',
  183. '__CLASS__': 'CLASS_C',
  184. '__METHOD__': 'METHOD_C',
  185. '__NAMESPACE__': 'NS_C',
  186.  
  187. 'AND': 'LOGICAL_AND',
  188. 'OR': 'LOGICAL_OR',
  189. 'XOR': 'LOGICAL_XOR',
  190. }
  191.  
  192. for r in reserved:
  193. reserved_map[r] = r
  194.  
  195. # Identifier
  196. def t_php_STRING(t):
  197. r'[A-Za-z_][\w_]*'
  198. t.type = reserved_map.get(t.value.upper(), 'STRING')
  199. return t
  200.  
  201. # Variable
  202. def t_php_VARIABLE(t):
  203. r'\$[A-Za-z_][\w_]*'
  204. return t
  205.  
  206. # Integer literal (todo)
  207. def t_php_LNUMBER(t):
  208. r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'
  209. return t
  210.  
  211. # Floating literal (todo)
  212. def t_php_DNUMBER(t):
  213. r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'
  214. return t
  215.  
  216. # String literal
  217. def t_php_CONSTANT_ENCAPSED_STRING(t):
  218. r'(\"([^\\\n]|(\\.))*?\")|(\'([^\\\n]|(\\.))*?\')'
  219. return t
  220.  
  221. def t_ANY_error(t):
  222. print("Illegal character %s" % repr(t.value[0]))
  223. t.lexer.skip(1)
  224.  
# Build the module-level lexer object through PLY, with optimize switched
# off (optimize=0). This has to run after all the rule definitions above.
lexer = lex.lex(optimize=0)
# When executed as a script, hand the lexer to PLY's runmain helper.
# NOTE(review): per PLY's documentation runmain tokenizes a file named on
# the command line (or stdin) and prints the tokens; confirm against the
# installed PLY version.
if __name__ == "__main__":
    lex.runmain(lexer)
Add Comment
Please, Sign In to add comment