SHARE
TWEET

Untitled

a guest Aug 6th, 2010 706 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
####################################################################
  2. ## felipe.andres.manzano@gmail.com  http://feliam.wordpress.com/  ##
  3. ## twitter.com/feliam        http://www.linkedin.com/in/fmanzano  ##
  4. ####################################################################
  5. # PDF scanner/tokenizer
  6.  
  7. import sys
  8. import ply.lex as lex
  9. TOKEN = lex.TOKEN
  10.  
#Terrible hack to mitigate the streams containing "endstream" tokens thing:
# holds the last declared /Length value so t_STREAM_DATA can skip at least
# that many bytes before searching for the "endstream" keyword.
stream_len = None

# Tokens
# Token names PLY will emit; a parser built on this lexer refers to them by name.
tokens = ('HEXSTRING','STRING', 'NUMBER', 'NAME', 'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET',
          'NULL', 'TRUE', 'FALSE', 'R', 'DOUBLE_LESS_THAN_SIGN', 'DOUBLE_GREATER_THAN_SIGN',
          'STREAM_DATA', 'OBJ', 'ENDOBJ', 'HEADER', 'TRAILER', 'EOF', 'STARTXREF' ,
          'XREF' )

#different lexers used..
# Exclusive PLY lexer states: while inside 'string', 'name' or 'xref' only the
# t_<state>_* rules below apply (those constructs have their own sub-syntax).
states = ( ('string', 'exclusive'),
           ('name', 'exclusive'),
           ('xref', 'exclusive'),
         )
  25.  
  26. #7.2.2 Character Set
  27. #7.2.2      Character Set
  28. #The PDF character set is divided into three classes, called regular,
  29. #delimiter, and white-space characters. This classification determines
  30. #the grouping of characters into tokens. The rules defined in this
  31. #sub-clause apply to all characters in the file except within strings,
  32.  
  33. #streams, and comments.
  34. white_spaces_r = r"\x20\r\n\t\x0c\x00"
  35. white_spaces = "\x20\r\n\t\x0c\x00"
  36.  
  37. #The delimiter characters (, ), <, >, [, ], {, }, /, and %
  38. delimiters = r"()<>[]/%" #This is odd: {} ?
  39. delimiters_r = r"()<>\[\]/%" #This is odd: {} ?
  40.  
  41. #The CARRIAGE RETURN (0Dh) and LINE FEED (0Ah) characters, also called
  42. #newline characters, shall be treated as end-of-line (EOL) markers. The
  43. #combination of a CARRIAGE RETURN followed immediately by a LINE FEED
  44. #shall be treated as one EOL marker.
  45. eol = r'(\r|\n|\r\n)'
  46.  
  47.  
  48. #########################################################################
  49. #INITIAL lexer
  50.  
  51. #7.3.2 Boolean Objects
  52. #Boolean objects represent the logical values of true and false. They appear
  53. #in PDF files using the keywords true and false.
# Keyword rules for the boolean objects "true" and "false" (7.3.2).
# PLY treats these plain-string rules as regular expressions.
t_TRUE = "true"
t_FALSE = "false"
  56.  
  57. #################################################################################
  58. #string lexer
  59. #7.3.4.2    Literal Strings
  60.  
  61. #A literal string shall be written as an arbitrary number of characters
  62. #enclosed in parentheses. Any characters may appear in a string except
  63. #unbalanced parentheses and the backslash, which shall be treated
  64. #specially as described in this sub-clause. Balanced pairs of
  65. #parentheses within a string require no special treatment.
  66.  
  67. #EXAMPLE 1        The following are valid literal strings:
  68. #                 ( This is a string )
  69. #                 ( Strings may contain newlines
  70. #                 and such . )
  71. #                 ( Strings may contain balanced parentheses ( ) and
  72. #                 special characters ( * ! & } ^ % and so on ) . )
  73. #                 ( The following is an empty string . )
  74. #                 ()
  75. #                 ( It has zero ( 0 ) length . )
  76.  
  77.  
  78. #An end-of-line marker appearing within a literal string without a
  79. #preceding REVERSE SOLIDUS shall be treated as a byte value of (0Ah),
  80. #irrespective of whether the end-of-line marker was a CARRIAGE RETURN
  81. #(0Dh), a LINE FEED (0Ah), or both.
# An EOL inside a literal string (not preceded by a backslash) is normalized
# to a single LINE FEED byte (0Ah), whatever form it took in the file.
@TOKEN(eol)
def t_string_LITERAL_STRING_EOL(t):
    # No return value: nothing is emitted, the byte is accumulated instead.
    t.lexer.string += "\x0A"
  85.    
  86. @TOKEN(r'\\([nrtbf()\\]|[0-7]{1,3}|'+eol+')')    
  87. def t_string_ESCAPED_SEQUENCE(t):
  88.     val = t.value[1:]
  89.     if val[0] in '0123':
  90.         value = chr(int(val,8))
  91.     elif val[0] in '4567':
  92.         value = chr(int(val[:2],8)) + val[3:]
  93.     else:  
  94.         value = { "\n": "", "\r": "", "n": "\n", "r": "\r", "t": "\t", "b": "\b", "f": "\f", "(": "(", ")": ")", "\\": "\\" }[val[0]]
  95.     t.lexer.string += value
  96.  
  97. #PDF string insanity..
#PDF string insanity..
# A "(" inside a string: balanced parentheses are legal string content, so
# push another 'string' state to track nesting depth and keep the byte.
def t_string_LEFT_PARENTHESIS(t):
    r"\("
    t.lexer.push_state('string')
    t.lexer.string += "("
  102.    
  103. def t_string_RIGHT_PARENTHESIS(t):
  104.     r"\)"
  105.     t.lexer.pop_state()
  106.     if t.lexer.current_state() == 'string':
  107.         t.lexer.string += ")"
  108.     else:
  109.         t.type  = "STRING"
  110.         t.value = t.lexer.string
  111.         return t
  112.        
def t_string_LITERAL_STRING_CHAR(t):
    r'.'
    # Any other single character is literal string content; accumulate it.
    t.lexer.string += t.value
  116.  
#TODO: Log, increment a warning counter, or even dismiss the file
def t_string_error(t):
    # Recovery path: emit whatever was accumulated so far as the STRING token
    # and skip the offending byte so lexing can continue.
    print "Error scanning a literal string at %d\n"%t.lexer.lexpos
    t.type  = "STRING"
    t.value = t.lexer.string
    t.lexer.skip(1)
    return t
  124.    
def t_STRING(t):
    r"\("
    # An opening "(" in the INITIAL state starts a literal string (7.3.4.2):
    # enter the 'string' state with a fresh accumulation buffer. The STRING
    # token itself is emitted later by t_string_RIGHT_PARENTHESIS.
    t.lexer.push_state('string')
    t.lexer.string = ""
  129.    
  130. #7.3.4.3    Hexadecimal Strings
  131. #Strings may also be written in hexadecimal form, which is useful for
  132. #including arbitrary binary data in a PDF file.A hexadecimal string shall
  133. #be written as a sequence of hexadecimal digits (0-9 and either A-F or a-f)
  134. #encoded as ASCII characters and enclosed within angle brackets < and >.
  135. #EXAMPLE 1          < 4E6F762073686D6F7A206B6120706F702E >
  136. #Each pair of hexadecimal digits defines one byte of the string. White-space
  137. #characters shall be ignored. If the final digit of a hexadecimal string is
  138. #missing -that is, if there is an odd number of digits- the final digit shall be
  139. #assumed to be 0.
  140.  
  141. @TOKEN(r'<[a-fA-F0-9'+white_spaces_r+']*>')
  142. def t_HEXSTRING(t):
  143.     t.value =  ''.join([c for c in t.value if c not in white_spaces+"<>"])
  144.     t.value =  (t.value+('0'*(len(t.value)%2))).decode('hex')
  145.     return t
  146.  
  147.    
  148. #7.3.5      Name Objects
  149. #Beginning with PDF 1.2 a name object is an atomic symbol uniquely
  150. #defined by a sequence of any characters (8-bit values) except null
  151. #(character code 0).
  152. #
  153. #When writing a name in a PDF file, a SOLIDUS (2Fh) (/) shall be used to
  154. #introduce a name. The SOLIDUS is not part of the name but is a prefix
  155. #indicating that what follows is a sequence of characters representing
  156. #the name in the PDF file and shall follow these rules:
  157. #a)  A NUMBER SIGN (23h) (#) in a name shall be written by using its
  158. #    2-digit hexadecimal code (23), preceded     by the NUMBER SIGN.
  159. #b)  Any character in a name that is a regular character (other than
  160. #    NUMBER SIGN) shall be written as itself or by using its 2-digit
  161. #    hexadecimal code, preceded by the NUMBER SIGN.
  162. #c)  Any character that is not a regular character shall be written using
  163. #    its 2-digit hexadecimal code, preceded     by the NUMBER SIGN only.
  164.  
def t_NAME(t):
    r'/'
    # A SOLIDUS introduces a name object (7.3.5): switch to the 'name' state,
    # start an empty accumulator, and remember where the name began so the
    # eventual NAME token can report the position of the "/".
    t.lexer.push_state('name')
    t.lexer.name = ""
    t.lexer.start = t.lexpos
  170.  
def t_name_HEXCHAR(t):
    r'\#[0-9a-fA-F]{2}'
    # "#xx" inside a name encodes the character with 2-digit hex code xx.
    #Beginning with PDF 1.2 a name object is an atomic symbol uniquely
    #defined by a sequence of any characters (8-bit values) except null (character code 0).
    # NOTE(review): this assert vanishes under "python -O"; an explicit raise
    # would be sturdier input validation.
    assert t.value != "#00"
    t.lexer.name += t.value[1:].decode('hex')
  177.  
# Any regular character (neither white-space nor delimiter) is part of the name.
@TOKEN(r'[^'+white_spaces_r+delimiters_r+']')
def t_name_NAMECHAR(t):
    t.lexer.name += t.value
  181.    
  182.    
# A white-space or delimiter character terminates the name.
@TOKEN(r'['+white_spaces_r+delimiters_r+']')
def t_name_WHITESPACE(t):
    global stream_len
    t.lexer.pop_state()
    # Push the terminating character back so the INITIAL lexer re-reads it
    # as its own token.
    t.lexer.lexpos -= 1
    # Report the NAME token at the position of the introducing "/".
    t.lexpos = t.lexer.start
    t.type  = "NAME"
    t.value = t.lexer.name
    t.lexer.name=""
    if t.value == "Length":
        # Part of the "endstream" hack: arm stream_len so the next NUMBER
        # token records the declared stream length (see t_NUMBER).
        stream_len = None
    return t
  195.  
def t_name_error(t):
    # NOTE(review): unlike the other error handlers this one does not skip(1);
    # in practice t_name_NAMECHAR/t_name_WHITESPACE cover every character, so
    # this path looks unreachable.
    print "Name error",t.lexer.lexpos, t.lexer.name
  198.  
#7.3.6 Array Objects
#An array shall be written as a sequence of objects enclosed in [ and ].
#EXAMPLE         [ 549 3.14 false ( Ralph ) /SomeName ]
# The brackets are emitted as standalone tokens; the parser assembles arrays.
t_LEFT_SQUARE_BRACKET = r"\["
t_RIGHT_SQUARE_BRACKET = r"\]"
  204.  
#7.3.7      Dictionary Objects
#A dictionary shall be written as a sequence of key-value pairs
#enclosed in double angle brackets (<< ... >>)
# Tokens for the << and >> that bracket dictionary objects. A "<<" cannot be
# taken for a hex string because t_HEXSTRING requires a closing ">".
t_DOUBLE_LESS_THAN_SIGN = r'<<'
t_DOUBLE_GREATER_THAN_SIGN = r'>>'
  210.  
  211. ############################################################################
  212. #7.3.8     Stream Objects
  213. #A stream object, like a string object, is a sequence of bytes. A stream
  214. #shall consist of a dictionary followed by zero or more bytes bracketed between
  215. #the keywords stream(followed by newline) and endstream
  216.  
  217. #The keyword stream that follows the stream dictionary shall be followed by an
  218. #end-of-line marker consisting of either a CARRIAGE RETURN and a LINE FEED or
  219. #just a LINE FEED, and not by a CARRIAGE RETURN alone.
  220. def t_STREAM_DATA(t):
  221.     r'stream(\r\n|\n)'
  222.     global stream_len
  223.     if stream_len and stream_len > 0:
  224.         found = t.lexer.lexdata.find('endstream',t.lexer.lexpos+stream_len)
  225.     else:
  226.         found = t.lexer.lexdata.find('endstream',t.lexer.lexpos)
  227.     stream_len = None
  228.    
  229.     if found != -1:
  230.         chop = 0
  231.  
  232.         if t.lexer.lexdata[found-3] == '\r':
  233.             chop = {'\r':1, '\n':2}[t.lexer.lexdata[found-2]]
  234.         elif t.lexer.lexdata[found-2] in ['\n','\r']:
  235.             chop = 1
  236.         else:
  237.             #TODO log errors
  238.             #print "Warning in endstream"
  239.             pass
  240.         t.value = t.lexer.lexdata[t.lexer.lexpos: found -1 - chop]
  241.         t.lexer.lexpos = found + 9
  242.         t.type  = "STREAM_DATA"
  243.     else:
  244.         raise Exception("Error:Parsing:Lexer: COuld not found endstream string.")
  245.     return t
  246.  
#7.3.9      Null Object
#The null object has a type and value that are unequal to those of any
#other object. There shall be only one object of type null, denoted by
#the keyword null.
# Keyword rule for the null object.
t_NULL = r'null'
  252.  
  253. #7.3.10   Indirect Objects
  254. #Any object in a PDF file may be labelled as an indirect object.The
  255. #definition of an indirect object in a PDF file shall consist of its
  256. #object number and generation number(separated by white space),
  257. #followed by the value of the object bracketed between the keywords
  258. #obj and endobj.
  259. def t_OBJ(t):
  260.     r'\d+\x20\d+\x20obj' #[0-9]{1,10} [0-9]+ obj'
  261.     t.value = tuple(t.value.split("\x20")[:2])
  262.     return t
# Keyword that closes an indirect object definition.
t_ENDOBJ = r'endobj'
  264.  
  265. #The object may be referred to from elsewhere in the file by an indirect
  266. #reference. Such indirect references shall consist of the object number,
  267. #the generation number, and the keyword R (with white space separating each
  268. #part):
  269. #EXAMPLE     12 0 R
  270. def t_R(t):
  271.     r'\d+\x20\d+\x20R'
  272.     t.value = tuple([int(x,10) for x in t.value.split("\x20")[:2] ])
  273.     return t
  274.    
  275. #7.3.3 Numeric Objects
  276. #PDF provides two types of numeric objects: integer and real. Integer objects
  277. #represent mathematical integers. Real objects represent mathematical real numbers.
  278. def t_NUMBER(t):
  279.     r'[+-]{0,1}(\d*\.\d+|\d+\.\d*|\d+)' #34.5 -3.62 +123.6 4. -.002 0.0 123 43445 +17 -98 0
  280.     global stream_len
  281.     if stream_len == None:
  282.         stream_len = int(float(t.value))
  283.     return t
  284.  
  285.  
  286. #7.5.2      File Header
  287. #The first line of a PDF file shall be a header consisting of the 5 characters %PDF-
  288. #followed by a version number of the form 1.N, where N is a digit between 0 and 7.
  289. def t_HEADER(t):
  290.     r'%PDF-1\.[0-7]'
  291.     t.value = t.value[-3:]
  292.     return t
  293.    
  294. #7.5.4     Cross-Reference Table
  295. #Each cross-reference section shall begin with a line containing the keyword
  296. #xref. Following this line shall be one or more cross-reference subsections,
  297. #which may appear in any order.
# "xref" introduces a cross-reference section (7.5.4): switch to the 'xref'
# state and start collecting (subsection-header, entries) pairs.
@TOKEN(r'xref[' + white_spaces_r +']*'+eol)
def t_XREF(t):
    t.lexer.push_state('xref')
    t.lexer.xref = []
    # Remember where the table began so the eventual XREF token (emitted by
    # t_xref_out) can report the right position.
    t.lexer.xref_start = t.lexpos
  303.    
  304. def t_xref_XREFENTRY(t):
  305.     r'\d{10}[ ]\d{5}[ ][nf](\x20\x0D|\x20\x0A|\x0D\x0A)'
  306.     n = t.value.strip().split(" ")
  307.     t.lexer.xref[len(t.lexer.xref)-1][1].append((int(n[0],10), int(n[1],10), n[2]))
  308.  
#EXAMPLE 1 The following line introduces a subsection containing five objects
#numbered consecutively from 28 to 32.
#          28 5
# Subsection header "first count": start a new ((first, count), entries) pair.
@TOKEN(r'[0-9]+[ ][0-9]+[' + white_spaces_r +']*'+eol)
def t_xref_SUBXREF(t):
    n = t.value.split(" ")
    # int() tolerates the trailing white space/EOL still attached to n[1].
    t.lexer.xref.append(((int(n[0],10),int(n[1],10)),[]))
  316.    
  317. def t_xref_out(t):
  318.     r'.'
  319.     t.lexer.pop_state()  
  320.     t.type = 'XREF'
  321.     t.value = t.lexer.xref
  322.     t.lexer.lexpos -= 1
  323.     t.lexpos=t.lexer.xref_start
  324.     return t
  325.  
#TODO: Log, increment a warning counter, or even dismiss the file
def t_xref_error(t):
    # Recovery: report and skip one byte so scanning can continue.
    print "XREF Error"
    t.lexer.skip(1)
  330.  
  331.  
  332. #7.5.5      File Trailer
  333. #The trailer of a PDF file enables a conforming reader to quickly find the
  334. #cross-reference table and certain special objects. Conforming readers
  335. #should read a PDF file from its end. The last line of the file shall contain
  336. #only the end-of-file marker, %%EOF. The two preceding lines shall contain,
  337. #one per line and in order, the keyword startxref and the byte offset in the
  338. #decoded stream from the beginning of the file to the beginning of the xref
  339. #keyword in the last cross-reference section. The startxref line shall be
  340. #preceded by the trailer dictionary, consisting of the keyword trailer followed
  341. #by a series of key-value pairs enclosed in double anglebrackets (<< ... >>).
  342. #Thus, the trailer has the following overall structure:
  343. #       trailer
  344. #           << key1 value1
  345. #                key2 value2
  346. #                ...
  347. #                keyn valuen
  348. #           >>
  349. #       startxref
  350. #       Byte_offset_of_last_cross-reference_section
  351. #       %%EOF
# Keyword rule introducing the trailer dictionary (7.5.5).
t_TRAILER = r'trailer'
  353.  
# "startxref" followed by the byte offset of the last cross-reference section.
@TOKEN(r'startxref'+ '['+white_spaces_r+']+[0-9]+')
def t_STARTXREF(t):
    # Skip the 9-byte keyword plus one separator character; int() tolerates
    # any further leading white space before the offset digits.
    t.value = int(t.value[10:],10)
    return t
  358.    
#FYI: Probably trying to fix some ill-transmitted PDFs, some
#readers look for this marker in the last 1k bytes of the file.
# End-of-file marker keyword (7.5.5). Also recognized inside comments by
# t_ignore_COMMENT.
t_EOF = r'%%EOF'
  362.  
  363. #ignore the comments
  364. def t_ignore_COMMENT(t):
  365.     r'%[^\n\r]*[\n\r]'
  366.     if t.value.startswith("%%EOF"):
  367.         t.type = 'EOF'
  368.         return t
  369.  
  370. #Damn! A lexing error!!
  371. #TODO: Log, increment a warning counter, or even dismiss the file  
  372. def t_error(t):
  373.     print "ERROR:lexer: MAIN erris (Pos:%d):"%t.lexer.lexpos, t.lexer.lexdata[t.lexer.lexpos:][:10]
  374.     t.lexer.skip(1)
  375.  
# In the INITIAL state every character in white_spaces is skipped between tokens.
t_ignore = white_spaces

# Build the lexer
# NOTE(review): optimize=True makes PLY cache its tables and skip most rule
# validation; a stale cached table file can hide edits to the rules above.
lex.lex(optimize=True)
#lex.lex(debug=True)
  381. import zlib
  382. if __name__ == '__main__':
  383.     try:
  384.         import psyco
  385.         psyco.full()
  386.     except:
  387.         pass
  388.        
  389.     bytes = 0
  390.     files = 0
  391.     for filename in sys.argv[1:]:
  392.         try:
  393.             s = file(filename,"r").read()
  394.             files += 1
  395.             bytes += len(s)
  396.             # Give the lexer some input
  397.             lex.input(s)
  398.             print filename
  399.             # Tokenize
  400.             while True:
  401.                 tok = lex.token()
  402.                 if not tok: break      # No more input
  403.                
  404.                 print tok
  405.            
  406.         except:
  407.             print e, filename
  408.             print dir(e)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top