####################################################################
## felipe.andres.manzano@gmail.com  http://feliam.wordpress.com/  ##
## twitter.com/feliam  http://www.linkedin.com/in/fmanzano        ##
####################################################################
# PDF scanner/tokenizer
import sys
import ply.lex as lex
TOKEN = lex.TOKEN

#Terrible hack to mitigate streams containing "endstream" tokens:
#remember the last /Length value seen (see t_NUMBER and t_STREAM_DATA)
stream_len = None

# Tokens
tokens = ('HEXSTRING', 'STRING', 'NUMBER', 'NAME',
          'LEFT_SQUARE_BRACKET', 'RIGHT_SQUARE_BRACKET',
          'NULL', 'TRUE', 'FALSE', 'R',
          'DOUBLE_LESS_THAN_SIGN', 'DOUBLE_GREATER_THAN_SIGN',
          'STREAM_DATA', 'OBJ', 'ENDOBJ',
          'HEADER', 'TRAILER', 'EOF', 'STARTXREF', 'XREF')

#Different lexer states used..
states = (
    ('string', 'exclusive'),
    ('name', 'exclusive'),
    ('xref', 'exclusive'),
)

#7.2.2 Character Set
#The PDF character set is divided into three classes, called regular,
#delimiter, and white-space characters. This classification determines
#the grouping of characters into tokens. The rules defined in this
#sub-clause apply to all characters in the file except within strings,
#streams, and comments.
white_spaces_r = r"\x20\r\n\t\x0c\x00"
white_spaces = "\x20\r\n\t\x0c\x00"

#The delimiter characters are (, ), <, >, [, ], {, }, / and %
delimiters = r"()<>[]/%"      #This is odd: {} ?
delimiters_r = r"()<>\[\]/%"  #This is odd: {} ?

#The CARRIAGE RETURN (0Dh) and LINE FEED (0Ah) characters, also called
#newline characters, shall be treated as end-of-line (EOL) markers. The
#combination of a CARRIAGE RETURN followed immediately by a LINE FEED
#shall be treated as one EOL marker.
#Note: \r\n must come first in the alternation, otherwise \r alone would
#match and the two-character marker would never be seen as one EOL.
eol = r'(\r\n|\r|\n)'

#########################################################################
#INITIAL lexer

#7.3.2 Boolean Objects
#Boolean objects represent the logical values of true and false. They
#appear in PDF files using the keywords true and false.
t_TRUE = "true"
t_FALSE = "false"

#########################################################################
#string lexer

#7.3.4.2 Literal Strings
#A literal string shall be written as an arbitrary number of characters
#enclosed in parentheses. Any characters may appear in a string except
#unbalanced parentheses and the backslash, which shall be treated
#specially as described in this sub-clause. Balanced pairs of
#parentheses within a string require no special treatment.
#EXAMPLE 1 The following are valid literal strings:
#   ( This is a string )
#   ( Strings may contain newlines
#     and such . )
#   ( Strings may contain balanced parentheses ( ) and
#     special characters ( * ! & } ^ % and so on ) . )
#   ( The following is an empty string . )
#   ()
#   ( It has zero ( 0 ) length . )

#An end-of-line marker appearing within a literal string without a
#preceding REVERSE SOLIDUS shall be treated as a byte value of (0Ah),
#irrespective of whether the end-of-line marker was a CARRIAGE RETURN
#(0Dh), a LINE FEED (0Ah), or both.
@TOKEN(eol)
def t_string_LITERAL_STRING_EOL(t):
    t.lexer.string += "\x0A"

@TOKEN(r'\\([nrtbf()\\]|[0-7]{1,3}|' + eol + ')')
def t_string_ESCAPED_SEQUENCE(t):
    val = t.value[1:]
    if val[0] in '0123':
        #Up to three octal digits always fit in one byte
        value = chr(int(val, 8))
    elif val[0] in '4567':
        #Three octal digits starting with 4-7 would overflow one byte;
        #take two digits and keep any remaining digit as a literal char
        value = chr(int(val[:2], 8)) + val[2:]
    else:
        #An escaped EOL is a line continuation and produces nothing
        value = {"\n": "", "\r": "", "n": "\n", "r": "\r", "t": "\t",
                 "b": "\b", "f": "\f", "(": "(", ")": ")", "\\": "\\"}[val[0]]
    t.lexer.string += value

#PDF string insanity: balanced parentheses nest, so the string state is
#pushed once per ( and the STRING token is only emitted when the
#outermost ) pops us back to the INITIAL state
def t_string_LEFT_PARENTHESIS(t):
    r"\("
    t.lexer.push_state('string')
    t.lexer.string += "("

def t_string_RIGHT_PARENTHESIS(t):
    r"\)"
    t.lexer.pop_state()
    if t.lexer.current_state() == 'string':
        t.lexer.string += ")"
    else:
        t.type = "STRING"
        t.value = t.lexer.string
        return t

def t_string_LITERAL_STRING_CHAR(t):
    r'.'
    t.lexer.string += t.value

#TODO: Log, increment a warning counter, or even dismiss the file
def t_string_error(t):
    print "Error scanning a literal string at %d\n" % t.lexer.lexpos
    t.type = "STRING"
    t.value = t.lexer.string
    t.lexer.skip(1)
    return t

def t_STRING(t):
    r"\("
    t.lexer.push_state('string')
    t.lexer.string = ""
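#A minimal sketch of the literal-string rules above (the helper name and
#the sample bytes are mine, not part of the original tool). It assumes
#the module-level lexer has already been built by the lex.lex() call at
#the bottom of this file, so call it only after the module is loaded.
def _demo_literal_string():
    #\101 is octal for 'A', \n is a named escape, and the raw EOL in
    #the middle of the string becomes a single \x0A byte
    lex.input("(Octal \\101, escape \\n, and a\nraw EOL)")
    tok = lex.token()
    print tok.type, repr(tok.value)
    #Expected: STRING 'Octal A, escape \n, and a\nraw EOL'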
#7.3.4.3 Hexadecimal Strings
#Strings may also be written in hexadecimal form, which is useful for
#including arbitrary binary data in a PDF file. A hexadecimal string
#shall be written as a sequence of hexadecimal digits (0-9 and either
#A-F or a-f) encoded as ASCII characters and enclosed within angle
#brackets < and >.
#EXAMPLE 1  < 4E6F762073686D6F7A206B6120706F702E >
#Each pair of hexadecimal digits defines one byte of the string.
#White-space characters shall be ignored. If the final digit of a
#hexadecimal string is missing -that is, if there is an odd number of
#digits- the final digit shall be assumed to be 0.
@TOKEN(r'<[a-fA-F0-9' + white_spaces_r + ']*>')
def t_HEXSTRING(t):
    #Drop white space and the angle brackets, pad an odd number of
    #digits with a trailing 0, and decode the pairs into bytes
    t.value = ''.join([c for c in t.value if c not in white_spaces + "<>"])
    t.value = (t.value + ('0' * (len(t.value) % 2))).decode('hex')
    return t

#7.3.5 Name Objects
#Beginning with PDF 1.2 a name object is an atomic symbol uniquely
#defined by a sequence of any characters (8-bit values) except null
#(character code 0).
#
#When writing a name in a PDF file, a SOLIDUS (2Fh) (/) shall be used to
#introduce a name. The SOLIDUS is not part of the name but is a prefix
#indicating that what follows is a sequence of characters representing
#the name in the PDF file and shall follow these rules:
#a) A NUMBER SIGN (23h) (#) in a name shall be written by using its
#   2-digit hexadecimal code (23), preceded by the NUMBER SIGN.
#b) Any character in a name that is a regular character (other than
#   NUMBER SIGN) shall be written as itself or by using its 2-digit
#   hexadecimal code, preceded by the NUMBER SIGN.
#c) Any character that is not a regular character shall be written
#   using its 2-digit hexadecimal code, preceded by the NUMBER SIGN
#   only.
def t_NAME(t):
    r'/'
    t.lexer.push_state('name')
    t.lexer.name = ""
    t.lexer.start = t.lexpos

def t_name_HEXCHAR(t):
    r'\#[0-9a-fA-F]{2}'
    #A name is a sequence of any characters (8-bit values) except null
    #(character code 0)
    assert t.value != "#00"
    t.lexer.name += t.value[1:].decode('hex')

@TOKEN(r'[^' + white_spaces_r + delimiters_r + ']')
def t_name_NAMECHAR(t):
    t.lexer.name += t.value

@TOKEN(r'[' + white_spaces_r + delimiters_r + ']')
def t_name_WHITESPACE(t):
    global stream_len
    #The terminating character is not part of the name; push it back
    #and report the NAME token from the position of the SOLIDUS
    t.lexer.pop_state()
    t.lexer.lexpos -= 1
    t.lexpos = t.lexer.start
    t.type = "NAME"
    t.value = t.lexer.name
    t.lexer.name = ""
    if t.value == "Length":
        #Arm the stream-length hack: the next NUMBER is taken as /Length
        stream_len = None
    return t

def t_name_error(t):
    print "Name error", t.lexer.lexpos, t.lexer.name

#7.3.6 Array Objects
#An array shall be written as a sequence of objects enclosed in [ and ].
#EXAMPLE  [ 549 3.14 false ( Ralph ) /SomeName ]
t_LEFT_SQUARE_BRACKET = r"\["
t_RIGHT_SQUARE_BRACKET = r"\]"
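#A minimal sketch of the hexadecimal-string and name rules above (the
#helper name and sample bytes are mine). Like the other demos, it
#assumes the lexer built by lex.lex() at the bottom of this file.
def _demo_hexstring_and_name():
    #<4E6F 76> decodes to 'Nov' (white space is ignored); in the name,
    ##20 decodes to a space and the trailing blank terminates the name
    lex.input("<4E6F 76> /Adobe#20Green ")
    tok = lex.token()
    print tok.type, repr(tok.value)   #HEXSTRING 'Nov'
    tok = lex.token()
    print tok.type, repr(tok.value)   #NAME 'Adobe Green'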
#7.3.7 Dictionary Objects
#A dictionary shall be written as a sequence of key-value pairs
#enclosed in double angle brackets (<< ... >>)
t_DOUBLE_LESS_THAN_SIGN = r'<<'
t_DOUBLE_GREATER_THAN_SIGN = r'>>'

############################################################################
#7.3.8 Stream Objects
#A stream object, like a string object, is a sequence of bytes. A stream
#shall consist of a dictionary followed by zero or more bytes bracketed
#between the keywords stream (followed by a newline) and endstream.
#The keyword stream that follows the stream dictionary shall be followed
#by an end-of-line marker consisting of either a CARRIAGE RETURN and a
#LINE FEED or just a LINE FEED, and not by a CARRIAGE RETURN alone.
def t_STREAM_DATA(t):
    r'stream(\r\n|\n)'
    global stream_len
    if stream_len and stream_len > 0:
        #Trust the last /Length value and search for endstream after it,
        #so "endstream" bytes inside the data do not cut the stream short
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos + stream_len)
    else:
        found = t.lexer.lexdata.find('endstream', t.lexer.lexpos)
    stream_len = None
    if found != -1:
        #Trim the end-of-line marker that precedes the endstream keyword
        chop = 0
        if t.lexer.lexdata[found-3] == '\r':
            chop = {'\r': 1, '\n': 2}[t.lexer.lexdata[found-2]]
        elif t.lexer.lexdata[found-2] in ['\n', '\r']:
            chop = 1
        else:
            #TODO log errors
            #print "Warning in endstream"
            pass
        t.value = t.lexer.lexdata[t.lexer.lexpos: found - 1 - chop]
        t.lexer.lexpos = found + 9
        t.type = "STREAM_DATA"
    else:
        raise Exception("Error:Parsing:Lexer: Could not find the endstream keyword.")
    return t

#7.3.9 Null Object
#The null object has a type and value that are unequal to those of any
#other object. There shall be only one object of type null, denoted by
#the keyword null.
t_NULL = r'null'

#7.3.10 Indirect Objects
#Any object in a PDF file may be labelled as an indirect object. The
#definition of an indirect object in a PDF file shall consist of its
#object number and generation number (separated by white space),
#followed by the value of the object bracketed between the keywords
#obj and endobj.
def t_OBJ(t):
    r'\d+\x20\d+\x20obj'
    #alternatively: [0-9]{1,10} [0-9]+ obj
    t.value = tuple(t.value.split("\x20")[:2])
    return t

t_ENDOBJ = r'endobj'

#The object may be referred to from elsewhere in the file by an indirect
#reference. Such indirect references shall consist of the object number,
#the generation number, and the keyword R (with white space separating
#each part):
#EXAMPLE  12 0 R
def t_R(t):
    r'\d+\x20\d+\x20R'
    t.value = tuple([int(x, 10) for x in t.value.split("\x20")[:2]])
    return t

#7.3.3 Numeric Objects
#PDF provides two types of numeric objects: integer and real. Integer
#objects represent mathematical integers. Real objects represent
#mathematical real numbers.
def t_NUMBER(t):
    r'[+-]{0,1}(\d*\.\d+|\d+\.\d*|\d+)'
    #34.5 -3.62 +123.6 4. -.002 0.0 123 43445 +17 -98 0
    global stream_len
    if stream_len == None:
        stream_len = int(float(t.value))
    return t

#7.5.2 File Header
#The first line of a PDF file shall be a header consisting of the 5
#characters %PDF- followed by a version number of the form 1.N, where N
#is a digit between 0 and 7.
def t_HEADER(t):
    r'%PDF-1\.[0-7]'
    t.value = t.value[-3:]
    return t
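#A minimal sketch of how the /Length hack plays out (the helper name and
#the sample object are mine). The NAME rule clears stream_len when it
#sees "Length", t_NUMBER captures the next numeric value, and
#t_STREAM_DATA then skips that many bytes before searching for
#"endstream". Assumes the lexer built by lex.lex() at the bottom.
def _demo_stream():
    lex.input("1 0 obj << /Length 3 >>\nstream\nabc\nendstream endobj")
    tok = lex.token()
    while tok:
        print tok.type, repr(tok.value)   #... STREAM_DATA 'abc' ...
        tok = lex.token()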
#7.5.4 Cross-Reference Table
#Each cross-reference section shall begin with a line containing the
#keyword xref. Following this line shall be one or more cross-reference
#subsections, which may appear in any order.
@TOKEN(r'xref[' + white_spaces_r + ']*' + eol)
def t_XREF(t):
    t.lexer.push_state('xref')
    t.lexer.xref = []
    t.lexer.xref_start = t.lexpos

def t_xref_XREFENTRY(t):
    r'\d{10}[ ]\d{5}[ ][nf](\x20\x0D|\x20\x0A|\x0D\x0A)'
    n = t.value.strip().split(" ")
    #Append (offset, generation, type) to the current subsection
    t.lexer.xref[-1][1].append((int(n[0], 10), int(n[1], 10), n[2]))

#EXAMPLE 1 The following line introduces a subsection containing five
#objects numbered consecutively from 28 to 32.
#   28 5
@TOKEN(r'[0-9]+[ ][0-9]+[' + white_spaces_r + ']*' + eol)
def t_xref_SUBXREF(t):
    n = t.value.split(" ")
    t.lexer.xref.append(((int(n[0], 10), int(n[1], 10)), []))

def t_xref_out(t):
    r'.'
    #Anything else ends the section: push the character back and emit
    #the accumulated XREF token from the position of the xref keyword
    t.lexer.pop_state()
    t.type = 'XREF'
    t.value = t.lexer.xref
    t.lexer.lexpos -= 1
    t.lexpos = t.lexer.xref_start
    return t

#TODO: Log, increment a warning counter, or even dismiss the file
def t_xref_error(t):
    print "XREF Error"
    t.lexer.skip(1)

#7.5.5 File Trailer
#The trailer of a PDF file enables a conforming reader to quickly find
#the cross-reference table and certain special objects. Conforming
#readers should read a PDF file from its end. The last line of the file
#shall contain only the end-of-file marker, %%EOF. The two preceding
#lines shall contain, one per line and in order, the keyword startxref
#and the byte offset in the decoded stream from the beginning of the
#file to the beginning of the xref keyword in the last cross-reference
#section. The startxref line shall be preceded by the trailer
#dictionary, consisting of the keyword trailer followed by a series of
#key-value pairs enclosed in double angle brackets (<< ... >>).
#Thus, the trailer has the following overall structure:
#   trailer
#       << key1 value1
#          key2 value2
#          ...
#          keyn valuen
#       >>
#   startxref
#   Byte_offset_of_last_cross-reference_section
#   %%EOF
t_TRAILER = r'trailer'

@TOKEN(r'startxref[' + white_spaces_r + ']+[0-9]+')
def t_STARTXREF(t):
    #Skip the keyword and the first separator; int() strips any extra
    #white space before the digits
    t.value = int(t.value[10:], 10)
    return t

#FYI: Probably trying to fix some ill-transmitted PDFs, some readers
#look for this marker in the last 1k bytes of the file
t_EOF = r'%%EOF'

#Ignore the comments
def t_ignore_COMMENT(t):
    r'%[^\n\r]*[\n\r]'
    if t.value.startswith("%%EOF"):
        t.type = 'EOF'
        return t

#Damn! A lexing error!!
#TODO: Log, increment a warning counter, or even dismiss the file
def t_error(t):
    print "ERROR:lexer: MAIN error (Pos:%d):" % t.lexer.lexpos, t.lexer.lexdata[t.lexer.lexpos:][:10]
    t.lexer.skip(1)

t_ignore = white_spaces

# Build the lexer
lex.lex(optimize=True)
#lex.lex(debug=True)

import zlib

if __name__ == '__main__':
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    bytes = 0
    files = 0
    for filename in sys.argv[1:]:
        try:
            s = file(filename, "r").read()
            files += 1
            bytes += len(s)
            # Give the lexer some input
            lex.input(s)
            print filename
            # Tokenize
            while True:
                tok = lex.token()
                if not tok:
                    break  # No more input
                print tok
        except Exception, e:
            print e, filename
            print dir(e)
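#A minimal end-to-end sketch of the xref/trailer rules (the helper name
#and the sample bytes are mine): a one-entry cross-reference subsection,
#a trailer dictionary, startxref and the %%EOF marker. Uses the lexer
#built above.
def _demo_xref_trailer():
    lex.input("xref\n28 1\n0000000017 00005 n \n"
              "trailer << /Size 29 >> startxref 17\n%%EOF")
    tok = lex.token()
    while tok:
        print tok.type, repr(tok.value)
        tok = lex.token()
    #Expected first token: XREF [((28, 1), [(17, 5, 'n')])], then
    #TRAILER, the dictionary tokens, STARTXREF 17 and EOF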