View difference between Paste ID: <a href="/MMUVSiYM">MMUVSiYM</a> and <a href="/mHvHcdjd">mHvHcdjd</a>

import xml.etree.ElementTree as ET
1	-	import xml.etree.ElementTree as ET
1	+	from xml.etree.ElementTree import iterparse
2
class StackDict:
    """Tree of dict nodes navigated with a stack-like cursor.

    Every node is a dict holding a ``'subchildren'`` mapping (child name ->
    child node) plus one entry per key of the default template.  ``path`` is
    the list of child names leading from the root (``data``) to the node the
    cursor is on, and ``current_point`` is that node.
    """

    def __init__(self, default_value=None):
        """Create a tree containing only the root node.

        Args:
            default_value: optional mapping of field name -> initial value
                for every new node.  Callable values are called (with no
                arguments) to produce the initial value.
        """
        self.path = []
        self.default = default_value or {}
        self.data = self.make_new_node()
        self.current_point = self.data

    def push(self, path):
        """Move the cursor to the child named ``path``, creating it if needed.

        Returns:
            True when every node along the resulting path already existed,
            False when at least one node had to be created.
        """
        self.path.append(path)
        # Re-walk from the root so the cursor always agrees with self.path.
        self.current_point = self.data
        exists = True
        for node in self.path:
            new_point = self.current_point['subchildren'].get(node)
            if new_point is None:
                exists = False
                print("Creating child", ".".join(self.path), node)
                new_point = self.make_new_node()
                self.current_point['subchildren'][node] = new_point
            self.current_point = new_point
        return exists

    def make_new_node(self):
        """Return a fresh node initialised from the default template."""
        import copy

        new_node = {}
        new_node['subchildren'] = {}
        for name, val in self.default.items():
            if callable(val):
                new_node[name] = val()
            elif isinstance(val, (list, dict, set, bytearray)):
                # Copy mutable containers so nodes never share (and silently
                # mutate) the template's object.
                new_node[name] = copy.deepcopy(val)
            else:
                new_node[name] = val
        return new_node

    def pop(self):
        """Move the cursor back to the parent of the current node.

        Raises:
            IndexError: if the cursor is already at the root.
        """
        # The parent path is implicitly guaranteed to exist.
        self.path.pop()
        self.current_point = self.data
        for node in self.path:
            self.current_point = self.current_point['subchildren'][node]

    def top(self):
        """Reset the cursor to the root node."""
        self.path = []
        self.current_point = self.data
37
def print_schema(schema, prefix=' '):
    """Print the subtree below the schema's current node as an indented outline.

    For every child this prints its name, the name of its ``'data'`` type,
    its attributes (name and type name) and, recursively, its own children.
    The cursor is returned to where it started once the subtree is printed.

    Args:
        schema: cursor object exposing ``current_point``, ``push(name)`` and
            ``pop()``; nodes are dicts with ``'subchildren'``,
            ``'attributes'`` and ``'data'`` entries.
        prefix: indentation string; must be non-empty because its first
            character is repeated to indent deeper levels.
    """
    # Snapshot the names so the loop never iterates a dict while the cursor
    # is being moved around underneath it.
    for node in list(schema.current_point['subchildren']):
        schema.push(node)
        current = schema.current_point
        print(prefix[1:], node, ':', sep='')
        # 'data' is normally a type; fall back to str() so a node whose type
        # was never determined (e.g. None) does not abort the whole printout.
        data_kind = current['data']
        print(prefix[1:], getattr(data_kind, '__name__', str(data_kind)))
        if len(current['attributes']) > 0:
            print(prefix[1:], "attributes->")
            for attr, kind in current['attributes'].items():
                print(prefix + ' ', attr, ':', getattr(kind, '__name__', str(kind)))
        if len(current['subchildren']) > 0:
            print(prefix[1:], "children->")
            print_schema(schema, prefix + prefix[0] * 2)
        # Step back up; without this the next sibling would be pushed
        # underneath the child that was just printed.
        schema.pop()
50	-	def analyze(f):
50	+
51	-	parser = ET.XMLPullParser(['start','end'])
51	+
class NoData:
    """Sentinel type meaning "no text content was seen for this node"."""
53
def analyze(f, schema):
    """Merge the element structure of one XML document into ``schema``.

    The document is streamed with ``iterparse``.  Each element tag becomes a
    node in ``schema`` (nested like the elements), each attribute name is
    recorded with the type inferred from the first value seen for it, and
    each node's ``'data'`` entry records the inferred type of the element's
    text.

    Args:
        f: filename or file object containing the XML data.
        schema: cursor object exposing ``push(tag)``, ``pop()`` and
            ``current_point``; nodes must carry ``'attributes'`` (a dict)
            and ``'data'`` entries.
    """
    def identify_data_type(txt):
        """Return int, str or NoData depending on what ``txt`` looks like."""
        try:
            int(txt)
        except (TypeError, ValueError):
            # None (TypeError) and non-numeric text (ValueError) land here.
            if txt is None or (isinstance(txt, str) and len(txt) < 1):
                return NoData
            return str
        return int

    # Parse the source that was passed in, not a module-level name.
    for event, el in iterparse(f, ('start', 'end')):
        if event == 'start':
            # Attributes are already available on 'start'.  Known tags are
            # revisited too, so attributes that only appear on later
            # occurrences are still recorded; the first type seen wins.
            schema.push(el.tag)
            attributes = schema.current_point['attributes']
            for attr_name, attr_val in el.items():
                if attr_name not in attributes:
                    attributes[attr_name] = identify_data_type(attr_val)
        elif event == 'end':
            # Text and children are only reliable once the element is
            # complete, so the data type is inferred here rather than on
            # 'start'.  A node still marked None/NoData is retried on every
            # occurrence until some text is found.
            current = schema.current_point
            if current['data'] is None or current['data'] is NoData:
                current['data'] = identify_data_type(
                    el.text if len(el) < 1 else None
                )
            schema.pop()
79	+	import json, sys
80
class TypeEncoder(json.JSONEncoder):
    """JSON encoder that writes Python type objects as schema type names."""

    def default(self, obj):
        """Return the JSON value for ``obj``.

        Known type objects map to a type-name string (``NoData`` maps to
        JSON ``null``).  Anything else is passed to the base class, which
        raises ``TypeError``.
        """
        if isinstance(obj, type):
            type_names = {
                # NOTE(review): the 'integer' label was chosen to sit
                # alongside 'double'/'string'; confirm it matches what
                # consumers of the JSON expect.
                int: 'integer',
                float: 'double',
                NoData: None,
                str: 'string',
            }
            if obj in type_names:
                return type_names[obj]
        return super().default(obj)
# One schema accumulates the structure of every XML file named on the
# command line.
schema = StackDict({
    'attributes': dict,
    'data': None
})
# NOTE(review): the original listing shows `pass` immediately before
# `finally:` with the matching `try:`/`except` lines missing, so an exception
# may have been swallowed on purpose.  Here errors propagate, and the schema
# gathered so far is still printed -- confirm that is the intended behaviour.
try:
    for fn in sys.argv[1:]:
        # Only arguments with an .xml extension (any case) are analysed.
        if not fn.lower().endswith('.xml'):
            continue
        # Binary mode lets the XML parser apply the encoding declared in the
        # document; the with-block closes the file even if parsing fails.
        with open(fn, 'rb') as f:
            analyze(f, schema)
finally:
    print("Saving schema")
    schema.top()
    print_schema(schema)
    #with open(fn[:fn.rfind(".")]+"-structure.json",'w') as f:
    # json.dump(schema.data,f,cls=TypeEncoder,indent=2)