Advertisement
Guest User

Untitled

a guest
Aug 6th, 2010
1,189
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 14.28 KB | None | 0 0
  1. ###################s#################################################
  2. ## felipe.andres.manzano@gmail.com  http://feliam.wordpress.com/  ##
  3. ## twitter.com/feliam        http://www.linkedin.com/in/fmanzano  ##
  4. ####################################################################
  5. # PDF scanner/tokenizer
  6.  
  7. import sys
  8. import ply.lex as lex
  9. TOKEN = lex.TOKEN
  10.  
#Terrible hack to cope with stream payloads that themselves contain the
#bytes "endstream". t_name_WHITESPACE resets this to None when it sees the
#name "Length", t_NUMBER stores the next number lexed while it is None, and
#t_STREAM_DATA uses it as an offset from which to search for "endstream".
stream_len = None
  13.  
  14. # Tokens
  15. tokens = ('HEXSTRING','STRING', 'NUMBER', 'NAME', 'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET',
  16.           'NULL', 'TRUE', 'FALSE', 'R', 'DOUBLE_LESS_THAN_SIGN', 'DOUBLE_GREATER_THAN_SIGN',
  17.           'STREAM_DATA', 'OBJ', 'ENDOBJ', 'HEADER', 'TRAILER', 'EOF', 'STARTXREF' ,
  18.           'XREF' )
  19.  
  20. #different lexers used..
  21. states = ( ('string', 'exclusive'),
  22.            ('name', 'exclusive'),
  23.            ('xref', 'exclusive'),
  24.          )
  25.  
#7.2.2 Character Set
#The PDF character set is divided into three classes, called regular,
#delimiter, and white-space characters. This classification determines
#the grouping of characters into tokens. The rules defined in this
#sub-clause apply to all characters in the file except within strings,
#streams, and comments.
  34. white_spaces_r = r"\x20\r\n\t\x0c\x00"
  35. white_spaces = "\x20\r\n\t\x0c\x00"
  36.  
  37. #The delimiter characters (, ), <, >, [, ], {, }, /, and %
  38. delimiters = r"()<>[]/%" #This is odd: {} ?
  39. delimiters_r = r"()<>\[\]/%" #This is odd: {} ?
  40.  
  41. #The CARRIAGE RETURN (0Dh) and LINE FEED (0Ah) characters, also called
  42. #newline characters, shall be treated as end-of-line (EOL) markers. The
  43. #combination of a CARRIAGE RETURN followed immediately by a LINE FEED
  44. #shall be treated as one EOL marker.
  45. eol = r'(\r|\n|\r\n)'
  46.  
  47.  
  48. #########################################################################
  49. #INITIAL lexer
  50.  
  51. #7.3.2 Boolean Objects
  52. #Boolean objects represent the logical values of true and false. They appear
  53. #in PDF files using the keywords true and false.
# Plain-string rules for the two boolean keywords.
t_TRUE = "true"
t_FALSE = "false"
  56.  
  57. #################################################################################
  58. #string lexer
  59. #7.3.4.2    Literal Strings
  60.  
  61. #A literal string shall be written as an arbitrary number of characters
  62. #enclosed in parentheses. Any characters may appear in a string except
  63. #unbalanced parentheses and the backslash, which shall be treated
  64. #specially as described in this sub-clause. Balanced pairs of
  65. #parentheses within a string require no special treatment.
  66.  
  67. #EXAMPLE 1        The following are valid literal strings:
  68. #                 ( This is a string )
  69. #                 ( Strings may contain newlines
  70. #                 and such . )
  71. #                 ( Strings may contain balanced parentheses ( ) and
  72. #                 special characters ( * ! & } ^ % and so on ) . )
  73. #                 ( The following is an empty string . )
  74. #                 ()
  75. #                 ( It has zero ( 0 ) length . )
  76.  
  77.  
  78. #An end-of-line marker appearing within a literal string without a
  79. #preceding REVERSE SOLIDUS shall be treated as a byte value of (0Ah),
  80. #irrespective of whether the end-of-line marker was a CARRIAGE RETURN
  81. #(0Dh), a LINE FEED (0Ah), or both.
  82. @TOKEN(eol)
  83. def t_string_LITERAL_STRING_EOL(t):
  84.     t.lexer.string += "\x0A"
  85.    
  86. @TOKEN(r'\\([nrtbf()\\]|[0-7]{1,3}|'+eol+')')    
  87. def t_string_ESCAPED_SEQUENCE(t):
  88.     val = t.value[1:]
  89.     if val[0] in '0123':
  90.         value = chr(int(val,8))
  91.     elif val[0] in '4567':
  92.         value = chr(int(val[:2],8)) + val[3:]
  93.     else:  
  94.         value = { "\n": "", "\r": "", "n": "\n", "r": "\r", "t": "\t", "b": "\b", "f": "\f", "(": "(", ")": ")", "\\": "\\" }[val[0]]
  95.     t.lexer.string += value
  96.  
  97. #PDF string insanity..
  98. def t_string_LEFT_PARENTHESIS(t):
  99.     r"\("
  100.     t.lexer.push_state('string')
  101.     t.lexer.string += "("
  102.    
  103. def t_string_RIGHT_PARENTHESIS(t):
  104.     r"\)"
  105.     t.lexer.pop_state()
  106.     if t.lexer.current_state() == 'string':
  107.         t.lexer.string += ")"
  108.     else:
  109.         t.type  = "STRING"
  110.         t.value = t.lexer.string
  111.         return t
  112.        
  113. def t_string_LITERAL_STRING_CHAR(t):
  114.     r'.'
  115.     t.lexer.string += t.value
  116.  
  117. #TODO: Log, increment a warning counter, or even dismiss the file  
  118. def t_string_error(t):
  119.     print "Error scanning a literal string at %d\n"%t.lexer.lexpos
  120.     t.type  = "STRING"
  121.     t.value = t.lexer.string
  122.     t.lexer.skip(1)
  123.     return t
  124.    
  125. def t_STRING(t):
  126.     r"\("
  127.     t.lexer.push_state('string')
  128.     t.lexer.string = ""
  129.    
  130. #7.3.4.3    Hexadecimal Strings
  131. #Strings may also be written in hexadecimal form, which is useful for
  132. #including arbitrary binary data in a PDF file.A hexadecimal string shall
  133. #be written as a sequence of hexadecimal digits (0-9 and either A-F or a-f)
  134. #encoded as ASCII characters and enclosed within angle brackets < and >.
  135. #EXAMPLE 1          < 4E6F762073686D6F7A206B6120706F702E >
  136. #Each pair of hexadecimal digits defines one byte of the string. White-space
  137. #characters shall be ignored. If the final digit of a hexadecimal string is
  138. #missing -that is, if there is an odd number of digits- the final digit shall be
  139. #assumed to be 0.
  140.  
  141. @TOKEN(r'<[a-fA-F0-9'+white_spaces_r+']*>')
  142. def t_HEXSTRING(t):
  143.     t.value =  ''.join([c for c in t.value if c not in white_spaces+"<>"])
  144.     t.value =  (t.value+('0'*(len(t.value)%2))).decode('hex')
  145.     return t
  146.  
  147.    
  148. #7.3.5      Name Objects
  149. #Beginning with PDF 1.2 a name object is an atomic symbol uniquely
  150. #defined by a sequence of any characters (8-bit values) except null
  151. #(character code 0).
  152. #
  153. #When writing a name in a PDF file, a SOLIDUS (2Fh) (/) shall be used to
  154. #introduce a name. The SOLIDUS is not part of the name but is a prefix
  155. #indicating that what follows is a sequence of characters representing
  156. #the name in the PDF file and shall follow these rules:
  157. #a)  A NUMBER SIGN (23h) (#) in a name shall be written by using its
  158. #    2-digit hexadecimal code (23), preceded     by the NUMBER SIGN.
  159. #b)  Any character in a name that is a regular character (other than
  160. #    NUMBER SIGN) shall be written as itself or by using its 2-digit
  161. #    hexadecimal code, preceded by the NUMBER SIGN.
  162. #c)  Any character that is not a regular character shall be written using
  163. #    its 2-digit hexadecimal code, preceded     by the NUMBER SIGN only.
  164.  
  165. def t_NAME(t):
  166.     r'/'
  167.     t.lexer.push_state('name')    
  168.     t.lexer.name = ""
  169.     t.lexer.start = t.lexpos
  170.  
  171. def t_name_HEXCHAR(t):
  172.     r'\#[0-9a-fA-F]{2}'
  173.     #Beginning with PDF 1.2 a name object is an atomic symbol uniquely
  174.     #defined by a sequence of any characters (8-bit values) except null (character code 0).
  175.     assert t.value != "#00"
  176.     t.lexer.name += t.value[1:].decode('hex')
  177.  
  178. @TOKEN(r'[^'+white_spaces_r+delimiters_r+']')
  179. def t_name_NAMECHAR(t):
  180.     t.lexer.name += t.value
  181.    
  182.    
  183. @TOKEN(r'['+white_spaces_r+delimiters_r+']')
  184. def t_name_WHITESPACE(t):
  185.     global stream_len
  186.     t.lexer.pop_state()
  187.     t.lexer.lexpos -= 1
  188.     t.lexpos = t.lexer.start
  189.     t.type  = "NAME"
  190.     t.value = t.lexer.name
  191.     t.lexer.name=""
  192.     if t.value == "Length":
  193.         stream_len = None
  194.     return t
  195.  
  196. def t_name_error(t):
  197.     print "Name error",t.lexer.lexpos, t.lexer.name
  198.  
  199. #7.3.6 Array Objects
  200. #An array shall be written as a sequence of objects enclosed in [ and ].
  201. #EXAMPLE         [ 549 3.14 false ( Ralph ) /SomeName ]
# Array delimiters.
t_LEFT_SQUARE_BRACKET = r"\["
t_RIGHT_SQUARE_BRACKET = r"\]"

#7.3.7      Dictionary Objects
#A dictionary shall be written as a sequence of key-value pairs
#enclosed in double angle brackets (<< ... >>)
# Dictionary delimiters.
t_DOUBLE_LESS_THAN_SIGN = r'<<'
t_DOUBLE_GREATER_THAN_SIGN = r'>>'
  210.  
  211. ############################################################################
  212. #7.3.8     Stream Objects
  213. #A stream object, like a string object, is a sequence of bytes. A stream
  214. #shall consist of a dictionary followed by zero or more bytes bracketed between
  215. #the keywords stream(followed by newline) and endstream
  216.  
  217. #The keyword stream that follows the stream dictionary shall be followed by an
  218. #end-of-line marker consisting of either a CARRIAGE RETURN and a LINE FEED or
  219. #just a LINE FEED, and not by a CARRIAGE RETURN alone.
  220. def t_STREAM_DATA(t):
  221.     r'stream(\r\n|\n)'
  222.     global stream_len
  223.     if stream_len and stream_len > 0:
  224.         found = t.lexer.lexdata.find('endstream',t.lexer.lexpos+stream_len)
  225.     else:
  226.         found = t.lexer.lexdata.find('endstream',t.lexer.lexpos)
  227.     stream_len = None
  228.    
  229.     if found != -1:
  230.         chop = 0
  231.  
  232.         if t.lexer.lexdata[found-3] == '\r':
  233.             chop = {'\r':1, '\n':2}[t.lexer.lexdata[found-2]]
  234.         elif t.lexer.lexdata[found-2] in ['\n','\r']:
  235.             chop = 1
  236.         else:
  237.             #TODO log errors
  238.             #print "Warning in endstream"
  239.             pass
  240.         t.value = t.lexer.lexdata[t.lexer.lexpos: found -1 - chop]
  241.         t.lexer.lexpos = found + 9
  242.         t.type  = "STREAM_DATA"
  243.     else:
  244.         raise Exception("Error:Parsing:Lexer: COuld not found endstream string.")
  245.     return t
  246.  
  247. #7.3.9      Null Object
  248. #The null object has a type and value that are unequal to those of any
  249. #other object. There shall be only one object of type null, denoted by
  250. #the keyword null.
# Plain-string rule for the null keyword.
t_NULL = r'null'
  252.  
  253. #7.3.10   Indirect Objects
  254. #Any object in a PDF file may be labelled as an indirect object.The
  255. #definition of an indirect object in a PDF file shall consist of its
  256. #object number and generation number(separated by white space),
  257. #followed by the value of the object bracketed between the keywords
  258. #obj and endobj.
  259. def t_OBJ(t):
  260.     r'\d+\x20\d+\x20obj' #[0-9]{1,10} [0-9]+ obj'
  261.     t.value = tuple(t.value.split("\x20")[:2])
  262.     return t
  263. t_ENDOBJ = r'endobj'
  264.  
  265. #The object may be referred to from elsewhere in the file by an indirect
  266. #reference. Such indirect references shall consist of the object number,
  267. #the generation number, and the keyword R (with white space separating each
  268. #part):
  269. #EXAMPLE     12 0 R
  270. def t_R(t):
  271.     r'\d+\x20\d+\x20R'
  272.     t.value = tuple([int(x,10) for x in t.value.split("\x20")[:2] ])
  273.     return t
  274.    
  275. #7.3.3 Numeric Objects
  276. #PDF provides two types of numeric objects: integer and real. Integer objects
  277. #represent mathematical integers. Real objects represent mathematical real numbers.
  278. def t_NUMBER(t):
  279.     r'[+-]{0,1}(\d*\.\d+|\d+\.\d*|\d+)' #34.5 -3.62 +123.6 4. -.002 0.0 123 43445 +17 -98 0
  280.     global stream_len
  281.     if stream_len == None:
  282.         stream_len = int(float(t.value))
  283.     return t
  284.  
  285.  
  286. #7.5.2      File Header
  287. #The first line of a PDF file shall be a header consisting of the 5 characters %PDF-
  288. #followed by a version number of the form 1.N, where N is a digit between 0 and 7.
  289. def t_HEADER(t):
  290.     r'%PDF-1\.[0-7]'
  291.     t.value = t.value[-3:]
  292.     return t
  293.    
  294. #7.5.4     Cross-Reference Table
  295. #Each cross-reference section shall begin with a line containing the keyword
  296. #xref. Following this line shall be one or more cross-reference subsections,
  297. #which may appear in any order.
  298. @TOKEN(r'xref[' + white_spaces_r +']*'+eol)
  299. def t_XREF(t):
  300.     t.lexer.push_state('xref')    
  301.     t.lexer.xref = []
  302.     t.lexer.xref_start = t.lexpos
  303.    
  304. def t_xref_XREFENTRY(t):
  305.     r'\d{10}[ ]\d{5}[ ][nf](\x20\x0D|\x20\x0A|\x0D\x0A)'
  306.     n = t.value.strip().split(" ")
  307.     t.lexer.xref[len(t.lexer.xref)-1][1].append((int(n[0],10), int(n[1],10), n[2]))
  308.  
  309. #EXAMPLE 1 The following line introduces a subsection containing five objects
  310. #numbered consecutively from 28 to 32.
  311. #          28 5
  312. @TOKEN(r'[0-9]+[ ][0-9]+[' + white_spaces_r +']*'+eol)
  313. def t_xref_SUBXREF(t):
  314.     n = t.value.split(" ")
  315.     t.lexer.xref.append(((int(n[0],10),int(n[1],10)),[]))
  316.    
  317. def t_xref_out(t):
  318.     r'.'
  319.     t.lexer.pop_state()  
  320.     t.type = 'XREF'
  321.     t.value = t.lexer.xref
  322.     t.lexer.lexpos -= 1
  323.     t.lexpos=t.lexer.xref_start
  324.     return t
  325.  
  326. #TODO: Log, increment a warning counter, or even dismiss the file  
  327. def t_xref_error(t):
  328.     print "XREF Error"
  329.     t.lexer.skip(1)
  330.  
  331.  
  332. #7.5.5      File Trailer
  333. #The trailer of a PDF file enables a conforming reader to quickly find the
  334. #cross-reference table and certain special objects. Conforming readers
  335. #should read a PDF file from its end. The last line of the file shall contain
  336. #only the end-of-file marker, %%EOF. The two preceding lines shall contain,
  337. #one per line and in order, the keyword startxref and the byte offset in the
  338. #decoded stream from the beginning of the file to the beginning of the xref
  339. #keyword in the last cross-reference section. The startxref line shall be
  340. #preceded by the trailer dictionary, consisting of the keyword trailer followed
  341. #by a series of key-value pairs enclosed in double anglebrackets (<< ... >>).
  342. #Thus, the trailer has the following overall structure:
  343. #       trailer
  344. #           << key1 value1
  345. #                key2 value2
  346. #                ...
  347. #                keyn valuen
  348. #           >>
  349. #       startxref
  350. #       Byte_offset_of_last_cross-reference_section
  351. #       %%EOF
# Plain-string rule for the keyword that introduces the trailer dictionary.
t_TRAILER = r'trailer'
  353.  
  354. @TOKEN(r'startxref'+ '['+white_spaces_r+']+[0-9]+')
  355. def t_STARTXREF(t):
  356.     t.value = int(t.value[10:],10)
  357.     return t
  358.    
#FYI: Probably to cope with badly transmitted PDFs, some readers
#look for this marker anywhere in the last 1k bytes of the file.
t_EOF = r'%%EOF'
  362.  
  363. #ignore the comments
  364. def t_ignore_COMMENT(t):
  365.     r'%[^\n\r]*[\n\r]'
  366.     if t.value.startswith("%%EOF"):
  367.         t.type = 'EOF'
  368.         return t
  369.  
  370. #Damn! A lexing error!!
  371. #TODO: Log, increment a warning counter, or even dismiss the file  
  372. def t_error(t):
  373.     print "ERROR:lexer: MAIN erris (Pos:%d):"%t.lexer.lexpos, t.lexer.lexdata[t.lexer.lexpos:][:10]
  374.     t.lexer.skip(1)
  375.  
# Characters skipped between tokens in the INITIAL state: the white-space set.
t_ignore = white_spaces
  377.  
  378. # Build the lexer
  379. lex.lex(optimize=True)
  380. #lex.lex(debug=True)
  381. import zlib
  382. if __name__ == '__main__':
  383.     try:
  384.         import psyco
  385.         psyco.full()
  386.     except:
  387.         pass
  388.        
  389.     bytes = 0
  390.     files = 0
  391.     for filename in sys.argv[1:]:
  392.         try:
  393.             s = file(filename,"r").read()
  394.             files += 1
  395.             bytes += len(s)
  396.             # Give the lexer some input
  397.             lex.input(s)
  398.             print filename
  399.             # Tokenize
  400.             while True:
  401.                 tok = lex.token()
  402.                 if not tok: break      # No more input
  403.                
  404.                 print tok
  405.            
  406.         except:
  407.             print e, filename
  408.             print dir(e)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement