SHOW:
|
|
- or go back to the newest paste.
1 | - | import xml.etree.ElementTree as ET |
1 | + | from xml.etree.ElementTree import iterparse |
2 | ||
class StackDict:
    """Tree of nested dict nodes navigated with a stack-like cursor.

    Every node is a dict holding a ``'subchildren'`` dict (child name ->
    child node) plus one entry per key of the default template.  ``data`` is
    the root node, ``path`` lists the child names leading from the root to
    the cursor, and ``current_point`` is the node the cursor is on.
    """

    def __init__(self, default_value=None):
        """Create a tree containing only the root node.

        Args:
            default_value: Optional mapping of extra keys given to every new
                node.  Callable values are called once per node to build a
                fresh value (e.g. ``dict``); other values are stored as-is.
        """
        self.path = []
        self.default = default_value or {}
        self.data = self.make_new_node()
        self.current_point = self.data
        # Nodes from the root down to current_point, kept parallel to
        # ``path`` so moving the cursor never re-walks the tree from the root.
        self._trail = [self.data]

    def push(self, path):
        """Move the cursor into child *path*, creating the child if absent.

        Returns:
            True if the child already existed, False if it was just created.
        """
        self.path.append(path)
        children = self.current_point['subchildren']
        child = children.get(path)
        exists = child is not None
        if not exists:
            print("Creating child", ".".join(self.path), path)
            child = self.make_new_node()
            children[path] = child
        self._trail.append(child)
        self.current_point = child
        return exists

    def make_new_node(self):
        """Return a fresh node: empty 'subchildren' plus the default entries."""
        new_node = {'subchildren': {}}
        for name, val in self.default.items():
            # Call factories so nodes never share one mutable default value.
            new_node[name] = val() if callable(val) else val
        return new_node

    def pop(self):
        """Move the cursor back up to the parent of the current node.

        Raises:
            IndexError: If the cursor is already at the root.
        """
        # The parent is guaranteed to exist: it was visited on the way down.
        self.path.pop()  # raises at the root before any state is changed
        self._trail.pop()
        self.current_point = self._trail[-1]

    def top(self):
        """Reset the cursor to the root node."""
        self.path = []
        self.current_point = self.data
        self._trail = [self.data]
37 | ||
def print_schema(schema, prefix=' '):
    """Print the subtree below the schema cursor as an indented outline.

    For every child of ``schema.current_point`` this prints the child's name,
    the ``__name__`` of its 'data' type, its attributes (name : type name)
    and then recurses into the child's own children with a deeper prefix.
    The cursor is returned to where it started.

    Args:
        schema: Cursor object providing ``current_point``, ``push(name)`` and
            ``pop()``; every node needs 'subchildren', 'attributes' and
            'data' entries.
        prefix: Non-empty indentation string; its first character is added
            twice for each nesting level.
    """
    # Snapshot the names so the loop never iterates a live dict view while
    # the cursor is being moved around.
    for node in list(schema.current_point['subchildren']):
        schema.push(node)
        point = schema.current_point
        print(prefix[1:], node, ':', sep='')
        print(prefix[1:], point['data'].__name__)
        if len(point['attributes']) > 0:
            print(prefix[1:], "attributes->")
            for attr, kind in point['attributes'].items():
                print(prefix + ' ', attr, ':', kind.__name__)
        if len(point['subchildren']) > 0:
            print(prefix[1:], "children->")
            print_schema(schema, prefix + prefix[0] * 2)
        # Step back up so the next sibling is looked up under the same
        # parent instead of being pushed beneath this child.
        schema.pop()
50 | - | def analyze(f): |
50 | + | |
51 | - | parser = ET.XMLPullParser(['start','end']) |
51 | + | |
class NoData:
    """Marker type recorded when an element or attribute has no text."""
53 | ||
def analyze(f, schema):
    """Merge the element structure of one XML document into *schema*.

    Each element path is recorded the first time it is seen: its attributes
    are classified when the start tag is read and its text when the end tag
    is read.  Later occurrences of an already-known path are skipped.

    Args:
        f: File name or file object containing XML; handed to ``iterparse``
            unchanged.  A file object passed in is not closed here.
        schema: Cursor object providing ``push(tag)`` (returning whether the
            child already existed), ``pop()`` and ``current_point``, whose
            nodes carry 'data' and 'attributes' entries.
    """
    def identify_data_type(txt):
        """Classify *txt* as ``int``, ``str``, or ``NoData`` (None or empty)."""
        if txt is None:
            return NoData
        try:
            int(txt)
        except ValueError:
            return str if txt else NoData
        return int

    # One flag per currently open element: True when its path is new.
    first_seen = []
    for event, el in iterparse(f, ('start', 'end')):
        if event == 'start':
            is_new = not schema.push(el.tag)
            first_seen.append(is_new)
            if not is_new:
                continue
            point = schema.current_point
            if point['data'] is None:
                # Placeholder until the end tag, so a parse that is cut
                # short still leaves every node with a type.
                point['data'] = NoData
            # Attributes are complete as soon as the start tag has been read.
            for attr_name, attr_val in el.items():
                if attr_name not in point['attributes']:
                    point['attributes'][attr_name] = identify_data_type(attr_val)
        elif event == 'end':
            # Text and children are only guaranteed to be populated at 'end';
            # on 'start' events iterparse leaves them undefined.
            if first_seen.pop():
                point = schema.current_point
                if point['data'] is NoData:
                    point['data'] = identify_data_type(el.text if len(el) < 1 else None)
            schema.pop()
79 | + | import json, sys |
80 | ||
class TypeEncoder(json.JSONEncoder):
    """JSON encoder that serialises Python type objects as type names."""

    def default(self, obj):
        """Return a JSON-compatible stand-in for *obj*.

        Type objects become 'double' (float), 'string' (str) or null
        (NoData); any other type falls back to its ``__name__`` (so ``int``
        becomes 'int') instead of raising ``KeyError``.  Non-type objects
        are deferred to the base class, which raises ``TypeError``.
        """
        if isinstance(obj, type):
            return {
                float: 'double',
                NoData: None,
                str: 'string',
            }.get(obj, obj.__name__)
        return super().default(obj)
# Build one combined schema from every *.xml path given on the command line.
schema = StackDict({
    'attributes': dict,  # callable: every node gets its own fresh dict
    'data': None,        # replaced with a type by analyze()
})
for fn in sys.argv:
    # sys.argv also holds the script name; the extension filter skips it.
    if not fn.lower().endswith('.xml'):
        continue
    # NOTE(review): the `except` clause that owned the original `pass` is not
    # visible here, so errors propagate once the summary has been printed.
    try:
        # Binary mode lets the XML parser honour the document's own encoding
        # declaration; the `with` closes the file even if parsing fails.
        with open(fn, 'rb') as f:
            analyze(f, schema)
    finally:
        print("Saving schema")
        schema.top()
        print_schema(schema)
        #with open(fn[:fn.rfind(".")]+"-structure.json",'w') as f:
        #    json.dump(schema.data,f,cls=TypeEncoder,indent=2)