Guest User

opaf PDF stat maker

a guest
Aug 25th, 2010
338
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. ####################################################################
  2. ## felipe.andres.manzano@gmail.com  http://feliam.wordpress.com/  ##
  3. ## twitter.com/feliam        http://www.linkedin.com/in/fmanzano  ##
  4. ####################################################################
  5. '''
  6. = Opaf! It's an Open PDF Analysis Framework! =
  7.  
  8. Usage: python stats.py file1.pdf ... fileN.pdf
  9.  
  10. Total number of parsed files: 1
  11. Total number of parsed bytes: 687662 [avg:687662.0]
  12. Total number of indirect objects: 788 [avg:788.0]
  13. Total number of streams: 89 [avg:89.0]
  14. Total number of filtered streams: 65 [avg:65.0]
  15. Total number of compressed objects: 684 [avg:684.0]
  16. Object Filter frequencies: {'FlateDecode': 58, 'DCTDecode': 7}
  17. Object Type frequencies: {'XObject': 8, 'XRef': 1, 'Group': 21, 'Encoding': 7, 'Pages': 1, 'Mask': 8, 'Catalog': 1, 'StructTreeRoot': 1, 'ExtGState': 13, 'FontDescriptor': 11, 'ObjStm': 7, 'Font': 15, 'Page': 6, 'Metadata': 17}
  18.  
  19.  
  20. '''
  21. import sys
  22. from opaflib import *
  23.  
  24. if __name__ == '__main__':
  25.     types = {}
  26.     filters = {}
  27.     bytes, iobjects, streams, fstreams, cobjects = [], [], [], [], []
  28.  
  29.     for filename in sys.argv[1:]:
  30.         try:
  31.             #read the pdf file
  32.             pdf = file(filename,'rb').read()
  33.            
  34.             #parse it to xml
  35.             xml_pdf = normalParser(pdf)
  36.     #        xml_pdf = bruteParser(pdf)
  37.    
  38.             #find, expand and parse every ObjStm
  39.             for objstm in xml_pdf.xpath(
  40.      '//*[starts-with(local-name(),"indirect_object")]/dictionary/dictionary_entry/name[@payload=enc("Type")]/following-sibling::*[position()=1 and @payload=enc("ObjStm")]/../../..'):        
  41.                 expand(objstm)
  42.                 expandObjStm(objstm)
  43.  
  44.             #count /Filters
  45.             for xml_fi in xml_pdf.xpath('//indirect_object_stream/dictionary/dictionary_entry/name[@payload=enc("Filter")]/../*[position()=2]'):
  46.                 if xml_fi.tag == 'array':
  47.                     fis = [payload(x) for x in xml_fi]
  48.                 elif xml_fi.tag == 'name':
  49.                     fis = [payload(xml_fi)]
  50.                 else:
  51.                     fis = []
  52.                     print "Error parsing filter name at %s in file %s"%(filename,xml_fi.get('lexstart'))
  53.                 for fi in fis:    
  54.                     filters[fi] = filters.get(fi,0)+1
  55.  
  56.             #Count Object Types
  57.             for ty in [payload(x) for x in xml_pdf.xpath('//*[starts-with(local-name(),"indirect_object")]/dictionary/dictionary_entry/name[@payload=enc("Type")]/following-sibling::*[1]')]:
  58.                 types[ty] = types.get(ty,0)+1
  59.  
  60.             #Other random counters...
  61.             iobjects.append(len(xml_pdf.xpath('//indirect_object')))
  62.             streams.append(len(xml_pdf.xpath('//indirect_object_stream')))
  63.             fstreams.append(len(xml_pdf.xpath('//indirect_object_stream/dictionary/dictionary_entry/name[@payload=enc("Filter")]/../../..')))
  64.             cobjects.append(len(xml_pdf.xpath('//indirect_object_stream//indirect_object')))
  65.             bytes.append(len(pdf))
  66.             print sum(bytes)
  67.             if sum(bytes) >100000000:
  68.                 break
  69.         except Exception,e:
  70.             print e
  71.             print ("Sorry couldn't parse file %s with sequential parser."%filename)        
  72.     def stats(L):
  73.         total = sum(L)
  74.         average = float(sum(L)) / len(L)
  75.         return total,average
  76.    
  77.     #Print statistics to stdout.
  78.     print "Total number of parsed files: %s"%len(bytes)
  79.    
  80.     if len(bytes) != 0 :
  81.         print "Total number of parsed bytes: %s [avg:%s]"%stats(bytes)
  82.         #print ("Total time: %s [avg:%s]"%stats(times))
  83.  
  84.         print "Total number of indirect objects: %s [avg:%s]"%stats(iobjects)
  85.         print "Total number of streams: %s [avg:%s]"%stats(streams)
  86.         print "Total number of filtered streams: %s [avg:%s]"%stats(fstreams)
  87.         print "Total number of compressed objects: %s [avg:%s]"%stats(cobjects)
  88.  
  89.         print "Object Filter frequencies: %s"%repr(filters)
  90.         print "Object Type frequencies: %s"%repr(types)
  91.  
  92.     print
RAW Paste Data