####################################################################
## felipe.andres.manzano@gmail.com  http://feliam.wordpress.com/  ##
## twitter.com/feliam  http://www.linkedin.com/in/fmanzano        ##
####################################################################
# PDF scanner/tokenizer
import sys
import ply.lex as lex
TOKEN = lex.TOKEN

#Terrible hack to mitigate streams containing "endstream" tokens:
#remember the last /Length value seen (see t_NUMBER and t_STREAM_DATA)
stream_len = None

# Tokens
tokens = ('HEXSTRING', 'STRING', 'NUMBER', 'NAME',
          'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET',
          'NULL', 'TRUE', 'FALSE', 'R',
          'DOUBLE_LESS_THAN_SIGN', 'DOUBLE_GREATER_THAN_SIGN',
          'STREAM_DATA', 'OBJ', 'ENDOBJ',
          'HEADER', 'TRAILER', 'EOF', 'STARTXREF', 'XREF')

#Different lexer states used..
states = (
    ('string', 'exclusive'),
    ('name', 'exclusive'),
    ('xref', 'exclusive'),
)

#7.2.2 Character Set
#The PDF character set is divided into three classes, called regular,
#delimiter, and white-space characters. This classification determines
#the grouping of characters into tokens. The rules defined in this
#sub-clause apply to all characters in the file except within strings,
#streams, and comments.
white_spaces_r = r"\x20\r\n\t\x0c\x00"
white_spaces = "\x20\r\n\t\x0c\x00"

#The delimiter characters are (, ), <, >, [, ], {, }, / and %
delimiters = r"()<>[]/%"      #This is odd: {} ?
delimiters_r = r"()<>\[\]/%"  #This is odd: {} ?

#The CARRIAGE RETURN (0Dh) and LINE FEED (0Ah) characters, also called
#newline characters, shall be treated as end-of-line (EOL) markers. The
#combination of a CARRIAGE RETURN followed immediately by a LINE FEED
#shall be treated as one EOL marker.
#Note: \r\n must come first in the alternation, otherwise \r alone would
#match and the two-character marker would never be seen as one EOL.
eol = r'(\r\n|\r|\n)'

#########################################################################
#INITIAL lexer

#7.3.2 Boolean Objects
#Boolean objects represent the logical values of true and false. They
#appear in PDF files using the keywords true and false.
t_TRUE = "true"
t_FALSE = "false"

#########################################################################
#string lexer

#7.3.4.2 Literal Strings
#A literal string shall be written as an arbitrary number of characters
#enclosed in parentheses. Any characters may appear in a string except
#unbalanced parentheses and the backslash, which shall be treated
#specially as described in this sub-clause. Balanced pairs of
#parentheses within a string require no special treatment.
#EXAMPLE 1 The following are valid literal strings:
#   ( This is a string )
#   ( Strings may contain newlines
#     and such . )
#   ( Strings may contain balanced parentheses ( ) and
#     special characters ( * ! & } ^ % and so on ) . )
#   ( The following is an empty string . )
#   ()
#   ( It has zero ( 0 ) length . )

#An end-of-line marker appearing within a literal string without a
#preceding REVERSE SOLIDUS shall be treated as a byte value of (0Ah),
#irrespective of whether the end-of-line marker was a CARRIAGE RETURN
#(0Dh), a LINE FEED (0Ah), or both.
@TOKEN(eol)
def t_string_LITERAL_STRING_EOL(t):
    t.lexer.string += "\x0A"

@TOKEN(r'\\([nrtbf()\\]|[0-7]{1,3}|' + eol + ')')
def t_string_ESCAPED_SEQUENCE(t):
    val = t.value[1:]
    if val[0] in '0123':
        #Up to three octal digits always fit in one byte
        value = chr(int(val, 8))
    elif val[0] in '4567':
        #Three octal digits starting with 4-7 would overflow one byte;
        #take two digits and keep any remaining digit as a literal char
        value = chr(int(val[:2], 8)) + val[2:]
    else:
        #An escaped EOL is a line continuation and produces nothing
        value = {"\n": "", "\r": "", "n": "\n", "r": "\r", "t": "\t",
                 "b": "\b", "f": "\f", "(": "(", ")": ")", "\\": "\\"}[val[0]]
    t.lexer.string += value

#PDF string insanity: balanced parentheses nest, so the string state is
#pushed once per ( and the STRING token is only emitted when the
#outermost ) pops us back to the INITIAL state
def t_string_LEFT_PARENTHESIS(t):
    r"\("
    t.lexer.push_state('string')
    t.lexer.string += "("

def t_string_RIGHT_PARENTHESIS(t):
    r"\)"
    t.lexer.pop_state()
    if t.lexer.current_state() == 'string':
        t.lexer.string += ")"
    else:
        t.type = "STRING"
        t.value = t.lexer.string
        return t

def t_string_LITERAL_STRING_CHAR(t):
    r'.'
    t.lexer.string += t.value

#TODO: Log, increment a warning counter, or even dismiss the file
def t_string_error(t):
    print "Error scanning a literal string at %d\n" % t.lexer.lexpos
    t.type = "STRING"
    t.value = t.lexer.string
    t.lexer.skip(1)
    return t

def t_STRING(t):
    r"\("
    t.lexer.push_state('string')
    t.lexer.string = ""
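#A minimal sketch of the literal-string rules above (the helper name and
#the sample bytes are mine, not part of the original tool). It assumes
#the module-level lexer has already been built by the lex.lex() call at
#the bottom of this file, so call it only after the module is loaded.
def _demo_literal_string():
    #\101 is octal for 'A', \n is a named escape, and the raw EOL in
    #the middle of the string becomes a single \x0A byte
    lex.input("(Octal \\101, escape \\n, and a\nraw EOL)")
    tok = lex.token()
    print tok.type, repr(tok.value)
    #Expected: STRING 'Octal A, escape \n, and a\nraw EOL'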
#7.3.4.3 Hexadecimal Strings
#Strings may also be written in hexadecimal form, which is useful for
#including arbitrary binary data in a PDF file. A hexadecimal string
#shall be written as a sequence of hexadecimal digits (0-9 and either
#A-F or a-f) encoded as ASCII characters and enclosed within angle
#brackets < and >.
#EXAMPLE 1  < 4E6F762073686D6F7A206B6120706F702E >
#Each pair of hexadecimal digits defines one byte of the string.
#White-space characters shall be ignored. If the final digit of a
#hexadecimal string is missing -that is, if there is an odd number of
#digits- the final digit shall be assumed to be 0.
@TOKEN(r'<[a-fA-F0-9' + white_spaces_r + ']*>')
def t_HEXSTRING(t):
    #Drop white space and the angle brackets, pad an odd number of
    #digits with a trailing 0, and decode the pairs into bytes
    t.value = ''.join([c for c in t.value if c not in white_spaces + "<>"])
    t.value = (t.value + ('0' * (len(t.value) % 2))).decode('hex')
    return t

#7.3.5 Name Objects
#Beginning with PDF 1.2 a name object is an atomic symbol uniquely
#defined by a sequence of any characters (8-bit values) except null
#(character code 0).
#
#When writing a name in a PDF file, a SOLIDUS (2Fh) (/) shall be used to
#introduce a name. The SOLIDUS is not part of the name but is a prefix
#indicating that what follows is a sequence of characters representing
#the name in the PDF file and shall follow these rules:
#a) A NUMBER SIGN (23h) (#) in a name shall be written by using its
#   2-digit hexadecimal code (23), preceded by the NUMBER SIGN.
#b) Any character in a name that is a regular character (other than
#   NUMBER SIGN) shall be written as itself or by using its 2-digit
#   hexadecimal code, preceded by the NUMBER SIGN.
#c) Any character that is not a regular character shall be written
#   using its 2-digit hexadecimal code, preceded by the NUMBER SIGN
#   only.
def t_NAME(t):
    r'/'
    t.lexer.push_state('name')
    t.lexer.name = ""
    t.lexer.start = t.lexpos

def t_name_HEXCHAR(t):
    r'\#[0-9a-fA-F]{2}'
    #A name is a sequence of any characters (8-bit values) except null
    #(character code 0)
    assert t.value != "#00"
    t.lexer.name += t.value[1:].decode('hex')

@TOKEN(r'[^' + white_spaces_r + delimiters_r + ']')
def t_name_NAMECHAR(t):
    t.lexer.name += t.value

@TOKEN(r'[' + white_spaces_r + delimiters_r + ']')
def t_name_WHITESPACE(t):
    global stream_len
    #The terminating character is not part of the name; push it back
    #and report the NAME token from the position of the SOLIDUS
    t.lexer.pop_state()
    t.lexer.lexpos -= 1
    t.lexpos = t.lexer.start
    t.type = "NAME"
    t.value = t.lexer.name
    t.lexer.name = ""
    if t.value == "Length":
        #Arm the stream-length hack: the next NUMBER is taken as /Length
        stream_len = None
    return t

def t_name_error(t):
    print "Name error", t.lexer.lexpos, t.lexer.name

#7.3.6 Array Objects
#An array shall be written as a sequence of objects enclosed in [ and ].
#EXAMPLE  [ 549 3.14 false ( Ralph ) /SomeName ]
t_LEFT_SQUARE_BRACKET = r"\["
t_RIGHT_SQUARE_BRACKET = r"\]"
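#A minimal sketch of the hexadecimal-string and name rules above (the
#helper name and sample bytes are mine). Like the other demos, it
#assumes the lexer built by lex.lex() at the bottom of this file.
def _demo_hexstring_and_name():
    #<4E6F 76> decodes to 'Nov' (white space is ignored); in the name,
    ##20 decodes to a space and the trailing blank terminates the name
    lex.input("<4E6F 76> /Adobe#20Green ")
    tok = lex.token()
    print tok.type, repr(tok.value)   #HEXSTRING 'Nov'
    tok = lex.token()
    print tok.type, repr(tok.value)   #NAME 'Adobe Green'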
#7.3.7 Dictionary Objects
#A dictionary shall be written as a sequence of key-value pairs
#enclosed in double angle brackets (<< ... >>)
t_DOUBLE_LESS_THAN_SIGN = r'<<'
t_DOUBLE_GREATER_THAN_SIGN = r'>>'

############################################################################
#7.3.8 Stream Objects
#A stream object, like a string object, is a sequence of bytes. A stream
#shall consist of a dictionary followed by zero or more bytes bracketed
#between the keywords stream (followed by a newline) and endstream.
#The keyword stream that follows the stream dictionary shall be followed
#by an end-of-line marker consisting of either a CARRIAGE RETURN and a
#LINE FEED or just a LINE FEED, and not by a CARRIAGE RETURN alone.
def t_STREAM_DATA(t):
    r'stream(\r\n|\n)'
    global stream_len
    if stream_len and stream_len > 0:
        #Trust the last /Length value and search for endstream after it,
        #so "endstream" bytes inside the data do not cut the stream short
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos + stream_len)
    else:
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos)
    stream_len = None
    if found != -1:
        #Trim the end-of-line marker that precedes the endstream keyword
        chop = 0
        if t.lexer.lexdata[found-3] == '\r':
            chop = {'\r': 1, '\n': 2}[t.lexer.lexdata[found-2]]
        elif t.lexer.lexdata[found-2] in ['\n', '\r']:
            chop = 1
        else:
            #TODO log errors
            #print "Warning in endstream"
            pass
        t.value = t.lexer.lexdata[t.lexer.lexpos: found - 1 - chop]
        t.lexer.lexpos = found + 9
        t.type = "STREAM_DATA"
    else:
        raise Exception("Error:Parsing:Lexer: Could not find the endstream keyword.")
    return t

#7.3.9 Null Object
#The null object has a type and value that are unequal to those of any
#other object. There shall be only one object of type null, denoted by
#the keyword null.
t_NULL = r'null'

#7.3.10 Indirect Objects
#Any object in a PDF file may be labelled as an indirect object. The
#definition of an indirect object in a PDF file shall consist of its
#object number and generation number (separated by white space),
#followed by the value of the object bracketed between the keywords
#obj and endobj.
def t_OBJ(t):
    r'\d+\x20\d+\x20obj'
    #alternatively: [0-9]{1,10} [0-9]+ obj
    t.value = tuple(t.value.split("\x20")[:2])
    return t

t_ENDOBJ = r'endobj'

#The object may be referred to from elsewhere in the file by an indirect
#reference. Such indirect references shall consist of the object number,
#the generation number, and the keyword R (with white space separating
#each part):
#EXAMPLE  12 0 R
def t_R(t):
    r'\d+\x20\d+\x20R'
    t.value = tuple([int(x, 10) for x in t.value.split("\x20")[:2]])
    return t

#7.3.3 Numeric Objects
#PDF provides two types of numeric objects: integer and real. Integer
#objects represent mathematical integers. Real objects represent
#mathematical real numbers.
def t_NUMBER(t):
    r'[+-]{0,1}(\d*\.\d+|\d+\.\d*|\d+)'
    #34.5 -3.62 +123.6 4. -.002 0.0 123 43445 +17 -98 0
    global stream_len
    if stream_len == None:
        stream_len = int(float(t.value))
    return t

#7.5.2 File Header
#The first line of a PDF file shall be a header consisting of the 5
#characters %PDF- followed by a version number of the form 1.N, where N
#is a digit between 0 and 7.
def t_HEADER(t):
    r'%PDF-1\.[0-7]'
    t.value = t.value[-3:]
    return t
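#A minimal sketch of how the /Length hack plays out (the helper name and
#the sample object are mine). The NAME rule clears stream_len when it
#sees "Length", t_NUMBER captures the next numeric value, and
#t_STREAM_DATA then skips that many bytes before searching for
#"endstream". Assumes the lexer built by lex.lex() at the bottom.
def _demo_stream():
    lex.input("1 0 obj << /Length 3 >>\nstream\nabc\nendstream endobj")
    tok = lex.token()
    while tok:
        print tok.type, repr(tok.value)   #... STREAM_DATA 'abc' ...
        tok = lex.token()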
#7.5.4 Cross-Reference Table
#Each cross-reference section shall begin with a line containing the
#keyword xref. Following this line shall be one or more cross-reference
#subsections, which may appear in any order.
@TOKEN(r'xref[' + white_spaces_r + ']*' + eol)
def t_XREF(t):
    t.lexer.push_state('xref')
    t.lexer.xref = []
    t.lexer.xref_start = t.lexpos

def t_xref_XREFENTRY(t):
    r'\d{10}[ ]\d{5}[ ][nf](\x20\x0D|\x20\x0A|\x0D\x0A)'
    n = t.value.strip().split(" ")
    #Append (offset, generation, type) to the current subsection
    t.lexer.xref[-1][1].append((int(n[0], 10), int(n[1], 10), n[2]))

#EXAMPLE 1 The following line introduces a subsection containing five
#objects numbered consecutively from 28 to 32.
#   28 5
@TOKEN(r'[0-9]+[ ][0-9]+[' + white_spaces_r + ']*' + eol)
def t_xref_SUBXREF(t):
    n = t.value.split(" ")
    t.lexer.xref.append(((int(n[0], 10), int(n[1], 10)), []))

def t_xref_out(t):
    r'.'
    #Anything else ends the section: push the character back and emit
    #the accumulated XREF token from the position of the xref keyword
    t.lexer.pop_state()
    t.type = 'XREF'
    t.value = t.lexer.xref
    t.lexer.lexpos -= 1
    t.lexpos = t.lexer.xref_start
    return t

#TODO: Log, increment a warning counter, or even dismiss the file
def t_xref_error(t):
    print "XREF Error"
    t.lexer.skip(1)

#7.5.5 File Trailer
#The trailer of a PDF file enables a conforming reader to quickly find
#the cross-reference table and certain special objects. Conforming
#readers should read a PDF file from its end. The last line of the file
#shall contain only the end-of-file marker, %%EOF. The two preceding
#lines shall contain, one per line and in order, the keyword startxref
#and the byte offset in the decoded stream from the beginning of the
#file to the beginning of the xref keyword in the last cross-reference
#section. The startxref line shall be preceded by the trailer
#dictionary, consisting of the keyword trailer followed by a series of
#key-value pairs enclosed in double angle brackets (<< ... >>).
#Thus, the trailer has the following overall structure:
#   trailer
#       << key1 value1
#          key2 value2
#          ...
#          keyn valuen
#       >>
#   startxref
#   Byte_offset_of_last_cross-reference_section
#   %%EOF
t_TRAILER = r'trailer'

@TOKEN(r'startxref[' + white_spaces_r + ']+[0-9]+')
def t_STARTXREF(t):
    #Skip the keyword and the first separator; int() strips any extra
    #white space before the digits
    t.value = int(t.value[10:], 10)
    return t

#FYI: Probably trying to fix some ill-transmitted PDFs, some readers
#look for this marker in the last 1k bytes of the file
t_EOF = r'%%EOF'

#Ignore the comments
def t_ignore_COMMENT(t):
    r'%[^\n\r]*[\n\r]'
    if t.value.startswith("%%EOF"):
        t.type = 'EOF'
        return t

#Damn! A lexing error!!
#TODO: Log, increment a warning counter, or even dismiss the file
def t_error(t):
    print "ERROR:lexer: MAIN error (Pos:%d):" % t.lexer.lexpos, t.lexer.lexdata[t.lexer.lexpos:][:10]
    t.lexer.skip(1)

t_ignore = white_spaces

# Build the lexer
lex.lex(optimize=True)
#lex.lex(debug=True)

import zlib

if __name__ == '__main__':
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    bytes = 0
    files = 0
    for filename in sys.argv[1:]:
        try:
            s = file(filename, "r").read()
            files += 1
            bytes += len(s)
            # Give the lexer some input
            lex.input(s)
            print filename
            # Tokenize
            while True:
                tok = lex.token()
                if not tok:
                    break  # No more input
                print tok
        except Exception, e:
            print e, filename
            print dir(e)
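#A minimal end-to-end sketch of the xref/trailer rules (the helper name
#and the sample bytes are mine): a one-entry cross-reference subsection,
#a trailer dictionary, startxref and the %%EOF marker. Uses the lexer
#built above.
def _demo_xref_trailer():
    lex.input("xref\n28 1\n0000000017 00005 n \n"
              "trailer << /Size 29 >> startxref 17\n%%EOF")
    tok = lex.token()
    while tok:
        print tok.type, repr(tok.value)
        tok = lex.token()
    #Expected first token: XREF [((28, 1), [(17, 5, 'n')])], then
    #TRAILER, the dictionary tokens, STARTXREF 17 and EOF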