Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ###################s#################################################
- ## felipe.andres.manzano@gmail.com http://feliam.wordpress.com/ ##
- ## twitter.com/feliam http://www.linkedin.com/in/fmanzano ##
- ####################################################################
- # PDF scanner/tokenizer
- import sys
- import ply.lex as lex
TOKEN = lex.TOKEN  # convenience alias for ply's regex-attaching decorator
#Terrible hack to mitigate the streams containing "endstream" tokens thing:
#holds the integer value of the most recent /Length entry (armed by
#t_name_WHITESPACE, filled in by t_NUMBER, consumed by t_STREAM_DATA),
#or None when no hint is pending.
stream_len = None
# Token names exposed to the parser (ply reads the module-level ``tokens``
# tuple to know which token types the lexer may emit).
tokens = (
    'HEXSTRING', 'STRING', 'NUMBER', 'NAME',
    'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET',
    'NULL', 'TRUE', 'FALSE', 'R',
    'DOUBLE_LESS_THAN_SIGN', 'DOUBLE_GREATER_THAN_SIGN',
    'STREAM_DATA', 'OBJ', 'ENDOBJ', 'HEADER', 'TRAILER', 'EOF',
    'STARTXREF', 'XREF',
)

# Exclusive sub-lexers: literal strings, name objects and cross-reference
# tables each get their own scanning state.
states = (
    ('string', 'exclusive'),
    ('name', 'exclusive'),
    ('xref', 'exclusive'),
)
- #7.2.2 Character Set
- #7.2.2 Character Set
- #The PDF character set is divided into three classes, called regular,
- #delimiter, and white-space characters. This classification determines
- #the grouping of characters into tokens. The rules defined in this
- #sub-clause apply to all characters in the file except within strings,
- #streams, and comments.
# 7.2.2 Character Set -- white-space characters (SPACE CR LF TAB FF NUL).
# The *_r variants are raw regex fragments meant for character classes;
# the plain variants hold the actual byte values.
white_spaces_r = r"\x20\r\n\t\x0c\x00"
white_spaces = "\x20\r\n\t\x0c\x00"
# Delimiter characters: ( ) < > [ ] / %   (the spec also lists { and },
# which are deliberately not treated as delimiters here)
delimiters = r"()<>[]/%"
delimiters_r = r"()<>\[\]/%"
# End-of-line marker: CR, LF, or the CR LF pair treated as one marker.
eol = r'(\r|\n|\r\n)'
#########################################################################
#INITIAL lexer
#7.3.2 Boolean Objects: the keywords ``true`` and ``false``.
t_TRUE, t_FALSE = "true", "false"
- #################################################################################
- #string lexer
- #7.3.4.2 Literal Strings
- #A literal string shall be written as an arbitrary number of characters
- #enclosed in parentheses. Any characters may appear in a string except
- #unbalanced parentheses and the backslash, which shall be treated
- #specially as described in this sub-clause. Balanced pairs of
- #parentheses within a string require no special treatment.
- #EXAMPLE 1 The following are valid literal strings:
- # ( This is a string )
- # ( Strings may contain newlines
- # and such . )
- # ( Strings may contain balanced parentheses ( ) and
- # special characters ( * ! & } ^ % and so on ) . )
- # ( The following is an empty string . )
- # ()
- # ( It has zero ( 0 ) length . )
- #An end-of-line marker appearing within a literal string without a
- #preceding REVERSE SOLIDUS shall be treated as a byte value of (0Ah),
- #irrespective of whether the end-of-line marker was a CARRIAGE RETURN
- #(0Dh), a LINE FEED (0Ah), or both.
#A bare EOL inside a literal string is recorded as a single LINE FEED
#(0Ah), whatever form the marker took in the file (7.3.4.2).
@TOKEN(eol)
def t_string_LITERAL_STRING_EOL(t):
    t.lexer.string = t.lexer.string + "\n"
#7.3.4.2: escape sequences inside literal strings.  A REVERSE SOLIDUS starts
#either a one-letter escape, an octal character code of 1-3 digits, or a
#line continuation (backslash followed by an EOL marker, which is dropped).
@TOKEN(r'\\([nrtbf()\\]|[0-7]{1,3}|'+eol+')')
def t_string_ESCAPED_SEQUENCE(t):
    val = t.value[1:]  # strip the leading backslash
    if val[0] in '01234567':
        #Octal escape.  Per ISO 32000-1 7.3.4.2 "high-order overflow shall
        #be ignored", so the (up to 3-digit) octal value is reduced mod 256.
        #BUGFIX: the old code split \4xx-\7xx sequences after two digits and
        #then sliced val[3:], silently discarding the third digit.
        value = chr(int(val, 8) & 0xff)
    else:
        #One-letter escapes; "\<EOL>" is a line continuation and adds
        #nothing (only the first character of a CR LF pair is consulted).
        value = { "\n": "", "\r": "", "n": "\n", "r": "\r", "t": "\t",
                  "b": "\b", "f": "\f", "(": "(", ")": ")", "\\": "\\" }[val[0]]
    t.lexer.string += value
#PDF string insanity: parentheses nest, so a "(" inside a string pushes
#another 'string' state instead of ending the token.
def t_string_LEFT_PARENTHESIS(t):
    r"\("
    lexer = t.lexer
    lexer.push_state('string')
    lexer.string += "("
def t_string_RIGHT_PARENTHESIS(t):
    r"\)"
    #A ")" closes one nesting level; only when the last 'string' state is
    #left does the accumulated buffer become a STRING token.
    t.lexer.pop_state()
    if t.lexer.current_state() != 'string':
        t.type = "STRING"
        t.value = t.lexer.string
        return t
    t.lexer.string += ")"
def t_string_LITERAL_STRING_CHAR(t):
    r'.'
    #Any other character is copied verbatim into the string buffer.
    t.lexer.string = t.lexer.string + t.value
- #TODO: Log, increment a warning counter, or even dismiss the file
- def t_string_error(t):
- print "Error scanning a literal string at %d\n"%t.lexer.lexpos
- t.type = "STRING"
- t.value = t.lexer.string
- t.lexer.skip(1)
- return t
def t_STRING(t):
    r"\("
    #An opening parenthesis in the INITIAL state starts a literal string:
    #reset the accumulator and switch to the 'string' sub-lexer.
    lexer = t.lexer
    lexer.string = ""
    lexer.push_state('string')
- #7.3.4.3 Hexadecimal Strings
- #Strings may also be written in hexadecimal form, which is useful for
- #including arbitrary binary data in a PDF file.A hexadecimal string shall
- #be written as a sequence of hexadecimal digits (0-9 and either A-F or a-f)
- #encoded as ASCII characters and enclosed within angle brackets < and >.
- #EXAMPLE 1 < 4E6F762073686D6F7A206B6120706F702E >
- #Each pair of hexadecimal digits defines one byte of the string. White-space
- #characters shall be ignored. If the final digit of a hexadecimal string is
- #missing -that is, if there is an odd number of digits- the final digit shall be
- #assumed to be 0.
#7.3.4.3 Hexadecimal Strings: hex digit pairs between "<" and ">".
#Embedded white space is ignored; an odd digit count is padded with a
#trailing "0" before decoding.
@TOKEN(r'<[a-fA-F0-9'+white_spaces_r+']*>')
def t_HEXSTRING(t):
    digits = ''.join(c for c in t.value if c not in white_spaces + "<>")
    if len(digits) % 2:
        digits += '0'
    t.value = digits.decode('hex')
    return t
- #7.3.5 Name Objects
- #Beginning with PDF 1.2 a name object is an atomic symbol uniquely
- #defined by a sequence of any characters (8-bit values) except null
- #(character code 0).
- #
- #When writing a name in a PDF file, a SOLIDUS (2Fh) (/) shall be used to
- #introduce a name. The SOLIDUS is not part of the name but is a prefix
- #indicating that what follows is a sequence of characters representing
- #the name in the PDF file and shall follow these rules:
- #a) A NUMBER SIGN (23h) (#) in a name shall be written by using its
- # 2-digit hexadecimal code (23), preceded by the NUMBER SIGN.
- #b) Any character in a name that is a regular character (other than
- # NUMBER SIGN) shall be written as itself or by using its 2-digit
- # hexadecimal code, preceded by the NUMBER SIGN.
- #c) Any character that is not a regular character shall be written using
- # its 2-digit hexadecimal code, preceded by the NUMBER SIGN only.
def t_NAME(t):
    r'/'
    #A SOLIDUS starts a name object (7.3.5): remember where it began, clear
    #the accumulator, and hand control to the 'name' sub-lexer.
    lexer = t.lexer
    lexer.name = ""
    lexer.start = t.lexpos
    lexer.push_state('name')
def t_name_HEXCHAR(t):
    r'\#[0-9a-fA-F]{2}'
    #"#xx" encodes one byte of the name by its 2-digit hex code (7.3.5).
    #NUL (character code 0) may never appear in a name, so "#00" is refused.
    assert t.value != "#00"
    hex_pair = t.value[1:]
    t.lexer.name += hex_pair.decode('hex')
#Any regular character (neither white-space nor delimiter) extends the name.
@TOKEN(r'[^'+white_spaces_r+delimiters_r+']')
def t_name_NAMECHAR(t):
    t.lexer.name = t.lexer.name + t.value
#A white-space or delimiter character terminates the name: emit the
#accumulated NAME token, push the terminating character back so the outer
#state re-scans it, and report the token position as that of the "/".
@TOKEN(r'['+white_spaces_r+delimiters_r+']')
def t_name_WHITESPACE(t):
    global stream_len
    t.lexer.pop_state()
    t.lexer.lexpos -= 1  # give the terminator back to the input
    t.lexpos = t.lexer.start
    t.type = "NAME"
    t.value = t.lexer.name
    t.lexer.name = ""
    if t.value == "Length":
        #Arm the stream-length hack: the next NUMBER token will be recorded
        #as the expected stream length (see t_NUMBER / t_STREAM_DATA).
        stream_len = None
    return t
#Scanner error inside the 'name' state; no docstring here on purpose --
#ply would interpret it as the rule's regex.
#TODO: log properly instead of printing.
def t_name_error(t):
    print "Name error",t.lexer.lexpos, t.lexer.name
#7.3.6 Array Objects: a sequence of objects enclosed in "[" and "]".
#EXAMPLE [ 549 3.14 false ( Ralph ) /SomeName ]
t_LEFT_SQUARE_BRACKET = "\\["
t_RIGHT_SQUARE_BRACKET = "\\]"
#7.3.7 Dictionary Objects: key-value pairs enclosed in "<<" and ">>".
t_DOUBLE_LESS_THAN_SIGN = '<<'
t_DOUBLE_GREATER_THAN_SIGN = '>>'
############################################################################
#7.3.8 Stream Objects
#A stream is a dictionary followed by raw bytes bracketed between the
#keywords "stream" (followed by an EOL marker) and "endstream".  The
#keyword stream shall be followed by CR LF or by LF alone, never by a
#lone CR.
def t_STREAM_DATA(t):
    r'stream(\r\n|\n)'
    global stream_len
    #Use the /Length hint captured by t_NUMBER (if any) so that stream
    #bodies which themselves contain the bytes "endstream" are skipped.
    if stream_len and stream_len > 0:
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos + stream_len)
        if found == -1:
            #BUGFIX: a broken /Length used to make the whole scan fail with
            #an exception; fall back to an unhinted search instead.
            found = t.lexer.lexdata.find('endstream', t.lexer.lexpos)
    else:
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos)
    stream_len = None
    if found == -1:
        raise Exception("Error:Parsing:Lexer: could not find the endstream keyword.")
    #The stream data ends before the EOL marker preceding "endstream".
    #The byte immediately before the keyword is always dropped; ``chop`` is
    #how many additional bytes of EOL to drop before that.
    chop = 0
    if t.lexer.lexdata[found-3] == '\r' and t.lexer.lexdata[found-2] in '\r\n':
        #BUGFIX: the old code indexed the dict without checking
        #lexdata[found-2], raising KeyError on binary stream data whose
        #second-to-last byte happened to be CR.
        chop = {'\r': 1, '\n': 2}[t.lexer.lexdata[found-2]]
    elif t.lexer.lexdata[found-2] in ('\n', '\r'):
        chop = 1
    else:
        #TODO log: no recognizable EOL before "endstream"
        pass
    t.value = t.lexer.lexdata[t.lexer.lexpos: found - 1 - chop]
    t.lexer.lexpos = found + len('endstream')
    t.type = "STREAM_DATA"
    return t
#7.3.9 Null Object: the single keyword ``null``.
t_NULL = 'null'
#7.3.10 Indirect Objects: "<obj-number> <gen-number> obj" opens an
#indirect object definition; the token value is the number pair.
def t_OBJ(t):
    r'\d+\x20\d+\x20obj'
    #NOTE(review): unlike t_R below, the two numbers are kept as *strings*
    #here -- presumably the parser normalizes them; confirm before changing.
    number, generation = t.value.split("\x20")[:2]
    t.value = (number, generation)
    return t
#The keyword closing an indirect object definition.
t_ENDOBJ = 'endobj'
#An indirect reference consists of the object number, the generation
#number, and the keyword R, white-space separated.
#EXAMPLE 12 0 R
def t_R(t):
    r'\d+\x20\d+\x20R'
    #Decode "12 0 R" into the integer pair (12, 0).
    parts = t.value.split("\x20")
    t.value = (int(parts[0], 10), int(parts[1], 10))
    return t
#7.3.3 Numeric Objects: integers and reals.
#Matches e.g. 34.5 -3.62 +123.6 4. -.002 0.0 123 43445 +17 -98 0
def t_NUMBER(t):
    r'[+-]{0,1}(\d*\.\d+|\d+\.\d*|\d+)'
    global stream_len
    #Stream-length hack: the first number seen while the hint is unarmed
    #(i.e. right after a /Length name) is recorded so t_STREAM_DATA can
    #skip over the stream body.
    if stream_len is None:  # idiom fix: was "== None"
        stream_len = int(float(t.value))
    return t
#7.5.2 File Header: the first line is "%PDF-1.N" with N in 0..7.
#The token value is just the version number, e.g. "1.4".
def t_HEADER(t):
    r'%PDF-1\.[0-7]'
    version = t.value[-3:]
    t.value = version
    return t
#7.5.4 Cross-Reference Table: the keyword "xref" opens a cross-reference
#section; subsection headers and entries are then scanned in the 'xref'
#state and collected on the lexer.
@TOKEN(r'xref[' + white_spaces_r + ']*' + eol)
def t_XREF(t):
    lexer = t.lexer
    lexer.push_state('xref')
    lexer.xref = []
    lexer.xref_start = t.lexpos
def t_xref_XREFENTRY(t):
    r'\d{10}[ ]\d{5}[ ][nf](\x20\x0D|\x20\x0A|\x0D\x0A)'
    #One entry: 10-digit offset, 5-digit generation, "n" (in use) or "f"
    #(free), terminated by a 2-byte EOL sequence.  Append it to the
    #current subsection (the last one opened by t_xref_SUBXREF).
    offset, generation, kind = t.value.strip().split(" ")
    t.lexer.xref[-1][1].append((int(offset, 10), int(generation, 10), kind))
#A subsection header: "first count".  EXAMPLE: "28 5" introduces five
#objects numbered consecutively from 28 to 32.
@TOKEN(r'[0-9]+[ ][0-9]+[' + white_spaces_r + ']*' + eol)
def t_xref_SUBXREF(t):
    fields = t.value.split(" ")
    first, count = int(fields[0], 10), int(fields[1], 10)
    t.lexer.xref.append(((first, count), []))
def t_xref_out(t):
    r'.'
    #First character that is not an xref entry: the table is over.  Emit
    #the collected XREF token (positioned at the "xref" keyword) and push
    #the character back for the INITIAL state to re-scan.
    t.lexer.pop_state()
    t.lexer.lexpos -= 1
    t.type = 'XREF'
    t.value = t.lexer.xref
    t.lexpos = t.lexer.xref_start
    return t
#TODO: Log, increment a warning counter, or even dismiss the file
#Scanner error inside the 'xref' state: report and skip the offending byte.
#(No docstring on purpose -- ply would treat it as the rule's regex.)
def t_xref_error(t):
    print "XREF Error"
    t.lexer.skip(1)
#7.5.5 File Trailer.  Conforming readers read a PDF from its end, which
#has the overall structure:
#    trailer
#    << key1 value1
#       ...
#       keyn valuen >>
#    startxref
#    <byte offset of the last cross-reference section>
#    %%EOF
#The trailer dictionary is introduced by the keyword "trailer".
t_TRAILER = 'trailer'
#"startxref" followed by the byte offset of the last cross-reference
#section; the token value is that offset as an int.
@TOKEN(r'startxref' + '[' + white_spaces_r + ']+[0-9]+')
def t_STARTXREF(t):
    #Skip the 9-byte keyword plus one white-space character; int()
    #tolerates any remaining surrounding white space.
    t.value = int(t.value[10:], 10)
    return t
#FYI: Probably trying to fix some ill-transmitted PDFs, some
#readers look for this marker in the last 1k bytes of the file
- t_EOF = r'%%EOF'
#Comments run from "%" to the end of the line and are normally dropped,
#but a comment starting with "%%EOF" is promoted to an EOF token.
def t_ignore_COMMENT(t):
    r'%[^\n\r]*[\n\r]'
    if not t.value.startswith("%%EOF"):
        return None
    t.type = 'EOF'
    return t
- #Damn! A lexing error!!
- #TODO: Log, increment a warning counter, or even dismiss the file
- def t_error(t):
- print "ERROR:lexer: MAIN erris (Pos:%d):"%t.lexer.lexpos, t.lexer.lexdata[t.lexer.lexpos:][:10]
- t.lexer.skip(1)
#Characters silently skipped between tokens: the PDF white-space set.
t_ignore = white_spaces
# Build the lexer (optimize=True caches the generated tables and skips
# most run-time validation).
lex.lex(optimize=True)
#lex.lex(debug=True)
- import zlib
- if __name__ == '__main__':
- try:
- import psyco
- psyco.full()
- except:
- pass
- bytes = 0
- files = 0
- for filename in sys.argv[1:]:
- try:
- s = file(filename,"r").read()
- files += 1
- bytes += len(s)
- # Give the lexer some input
- lex.input(s)
- print filename
- # Tokenize
- while True:
- tok = lex.token()
- if not tok: break # No more input
- print tok
- except:
- print e, filename
- print dir(e)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement