Advertisement
Tyler_Elric

Untitled

Dec 10th, 2015
167
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from xml.etree.ElementTree import iterparse
  2.  
  3. class StackDict:
  4.     def __init__(self,default_value=None):
  5.         self.path = []
  6.         self.default = default_value or {}
  7.         self.data = self.make_new_node()
  8.         self.current_point = self.data
  9.     def push(self,path):
  10.         self.path.append(path)
  11.         self.current_point = self.data
  12.         exists = True
  13.         for node in self.path:
  14.             new_point = self.current_point['subchildren'].get(node)
  15.             if new_point is None:
  16.                 exists = False
  17.                 print("Creating child",".".join(self.path),node)
  18.                 new_point = self.make_new_node()
  19.                 self.current_point['subchildren'][node] = new_point
  20.             self.current_point = new_point
  21.         return exists
  22.     def make_new_node(self):
  23.         new_node = {}
  24.         new_node['subchildren'] = {}
  25.         for name,val in self.default.items():
  26.             new_node[name] = val() if callable(val) else val
  27.         return new_node
  28.     def pop(self):
  29.         # previous path is implicitly gauranteed to exist.
  30.         self.path.pop()
  31.         self.current_point = self.data
  32.         for node in self.path:
  33.             self.current_point = self.current_point['subchildren'][node]
  34.     def top(self):
  35.         self.path = []
  36.         self.current_point = self.data
  37.  
  38. def print_schema(schema,prefix=' '):
  39.     for node in schema.current_point['subchildren'].keys():
  40.         schema.push(node)
  41.         print(prefix[1:],node,':',sep='')
  42.         print(prefix[1:],schema.current_point['data'].__name__)
  43.         if len(schema.current_point['attributes'])>0:
  44.             print(prefix[1:],"attributes->")
  45.             for attr, kind in schema.current_point['attributes'].items():
  46.                 print(prefix+' ',attr,':',kind.__name__)
  47.         if len(schema.current_point['subchildren']) > 0:
  48.             print(prefix[1:],"children->")
  49.             print_schema(schema,prefix + prefix[0] * 2)
  50.         schema.pop()
  51.  
  52. class NoData:pass
  53.  
  54. def analyze(f,schema):
  55.     block_size = 1024
  56.     def identify_data_type(txt):
  57.         try:
  58.             txt = int(txt)
  59.             return int
  60.         except:
  61.             return NoData if txt is None or (isinstance(txt,str) and len(txt)<1) else str
  62.         return NoData
  63.     for event, el in iterparse(fn,('start','end')):
  64.         if event=='start':
  65.             if schema.push(el.tag):
  66.                 continue
  67.             if schema.current_point['data'] is None or schema.current_point['data'] is NoData:
  68.                 schema.current_point['data'] = identify_data_type(el.text if len(el)<1 else None)
  69.             for attr_name,attr_val in el.items():
  70.                 if not attr_name in schema.current_point['attributes']:
  71.                     schema.current_point['attributes'][attr_name] = identify_data_type(attr_val)
  72.         elif event=='end':
  73.             schema.pop()
  74.     print("Calculated schema:")
  75.     schema.top()
  76.     return schema
  77.  
  78. if __name__=="__main__":
  79.     import json, sys
  80.  
  81.     class TypeEncoder(json.JSONEncoder):
  82.         def default(self,obj):
  83.             if isinstance(obj,type):
  84.                 return {
  85.                     int:'int',
  86.                     float:'double',
  87.                     NoData:None,
  88.                     str:'string'
  89.                 }[obj]
  90.             return super().default(obj)
  91.     schema = StackDict({
  92.         'attributes':dict,
  93.         'data':None
  94.     })
  95.     for fn in sys.argv:
  96.         if fn[fn.rfind(".")+1:].lower()!='xml':
  97.             continue
  98.         try:
  99.             analyze(open(fn),schema)
  100.         except:
  101.             pass
  102.         finally:
  103.             print("Saving schema")
  104.             schema.top()
  105.             print_schema(schema)
  106.             #with open(fn[:fn.rfind(".")]+"-structure.json",'w') as f:
  107.             #    json.dump(schema.data,f,cls=TypeEncoder,indent=2)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement