####################################################################
## felipe.andres.manzano@gmail.com http://feliam.wordpress.com/ ##
## twitter.com/feliam http://www.linkedin.com/in/fmanzano ##
####################################################################
'''
= Opaf! It's an Open PDF Analysis Framework! =

Prints aggregate statistics about the objects found in a set of PDF files.

Usage:
    python stats.py file1.pdf .... fileN.pdf

Example output:
Total number of parsed files: 1
Total number of parsed bytes: 687662 [avg:687662.0]
Total number of indirect objects: 788 [avg:788.0]
Total number of streams: 89 [avg:89.0]
Total number of filtered streams: 65 [avg:65.0]
Total number of compressed objects: 684 [avg:684.0]
Object Filter frequencies: {'FlateDecode': 58, 'DCTDecode': 7}
Object Type frequencies: {'XObject': 8, 'XRef': 1, 'Group': 21, 'Encoding': 7, 'Pages': 1, 'Mask': 8, 'Catalog': 1, 'StructTreeRoot': 1, 'ExtGState': 13, 'FontDescriptor': 11, 'ObjStm': 7, 'Font': 15, 'Page': 6, 'Metadata': 17}
'''
import sys
from opaflib import *
# Stop reading further files once the running total of parsed bytes
# exceeds this value.
_MAX_TOTAL_BYTES = 100000000

# XPath queries evaluated against the XML tree returned by the parser.
# Every indirect object whose /Type is /ObjStm (the object itself, not the name).
_XPATH_OBJSTM = (
    '//*[starts-with(local-name(),"indirect_object")]'
    '/dictionary/dictionary_entry/name[@payload=enc("Type")]'
    '/following-sibling::*[position()=1 and @payload=enc("ObjStm")]/../../..')
# The value node of every /Filter entry found in a stream dictionary.
_XPATH_FILTER_VALUES = (
    '//indirect_object_stream/dictionary/dictionary_entry'
    '/name[@payload=enc("Filter")]/../*[position()=2]')
# The value node of every /Type entry found in an indirect object dictionary.
_XPATH_TYPE_VALUES = (
    '//*[starts-with(local-name(),"indirect_object")]'
    '/dictionary/dictionary_entry/name[@payload=enc("Type")]'
    '/following-sibling::*[1]')
_XPATH_IOBJECTS = '//indirect_object'
_XPATH_STREAMS = '//indirect_object_stream'
_XPATH_FILTERED_STREAMS = (
    '//indirect_object_stream/dictionary/dictionary_entry'
    '/name[@payload=enc("Filter")]/../../..')
_XPATH_COMPRESSED_OBJECTS = '//indirect_object_stream//indirect_object'


def stats(L):
    '''Return the tuple ``(total, average)`` of the numbers in *L*.

    *L* must support ``len()``.  An empty *L* yields ``(0, 0.0)`` rather
    than raising ``ZeroDivisionError``.
    '''
    total = sum(L)
    count = len(L)
    if count == 0:
        return total, 0.0
    return total, float(total) / count


def _count_filters(xml_pdf, filename, filters):
    '''Add one to ``filters[name]`` for every stream filter in *xml_pdf*.

    A /Filter value may be a single name or an array of names; any other
    node type is reported on stdout and skipped.  *filename* is only used
    in that report.
    '''
    for xml_fi in xml_pdf.xpath(_XPATH_FILTER_VALUES):
        if xml_fi.tag == 'array':
            names = [payload(x) for x in xml_fi]
        elif xml_fi.tag == 'name':
            names = [payload(xml_fi)]
        else:
            names = []
            # Position first, file name second, matching the message text.
            print("Error parsing filter name at %s in file %s"
                  % (xml_fi.get('lexstart'), filename))
        for name in names:
            filters[name] = filters.get(name, 0) + 1


def _count_types(xml_pdf, types):
    '''Add one to ``types[name]`` for every /Type value in *xml_pdf*.'''
    for ty in [payload(x) for x in xml_pdf.xpath(_XPATH_TYPE_VALUES)]:
        types[ty] = types.get(ty, 0) + 1


def _main(filenames):
    '''Parse every file in *filenames* and print the statistics to stdout.

    A file that cannot be read or parsed is reported and skipped; it does
    not contribute to the per-file counters.
    '''
    types = {}
    filters = {}
    parsed_bytes, iobjects, streams, fstreams, cobjects = [], [], [], [], []

    for filename in filenames:
        try:
            # Read the pdf file, closing the handle even if read() fails.
            with open(filename, 'rb') as pdf_file:
                pdf = pdf_file.read()
            # Parse it to xml.
            # NOTE: the original carried a commented-out alternative that
            # called bruteParser(pdf) here instead.
            xml_pdf = normalParser(pdf)
            # Find, expand and parse every ObjStm.
            for objstm in xml_pdf.xpath(_XPATH_OBJSTM):
                expand(objstm)
                expandObjStm(objstm)
            _count_filters(xml_pdf, filename, filters)
            _count_types(xml_pdf, types)
            # Compute every counter before recording any of them, so a
            # failure here cannot leave the lists with different lengths.
            n_iobjects = len(xml_pdf.xpath(_XPATH_IOBJECTS))
            n_streams = len(xml_pdf.xpath(_XPATH_STREAMS))
            n_fstreams = len(xml_pdf.xpath(_XPATH_FILTERED_STREAMS))
            n_cobjects = len(xml_pdf.xpath(_XPATH_COMPRESSED_OBJECTS))
        except Exception as e:
            # Deliberate best effort: report the failure and move on.
            print(e)
            print("Sorry couldn't parse file %s with sequential parser." % filename)
            continue

        iobjects.append(n_iobjects)
        streams.append(n_streams)
        fstreams.append(n_fstreams)
        cobjects.append(n_cobjects)
        parsed_bytes.append(len(pdf))

        # Progress output: running total of bytes parsed so far.
        total_bytes = sum(parsed_bytes)
        print(total_bytes)
        if total_bytes > _MAX_TOTAL_BYTES:
            break

    # Print statistics to stdout.
    print("Total number of parsed files: %s" % len(parsed_bytes))
    if len(parsed_bytes) != 0:
        print("Total number of parsed bytes: %s [avg:%s]" % stats(parsed_bytes))
        print("Total number of indirect objects: %s [avg:%s]" % stats(iobjects))
        print("Total number of streams: %s [avg:%s]" % stats(streams))
        print("Total number of filtered streams: %s [avg:%s]" % stats(fstreams))
        print("Total number of compressed objects: %s [avg:%s]" % stats(cobjects))
        print("Object Filter frequencies: %s" % repr(filters))
        print("Object Type frequencies: %s" % repr(types))
    # Single-argument print() emits one blank line on Python 2 and 3 alike.
    print("")


if __name__ == '__main__':
    _main(sys.argv[1:])