View difference between Paste ID: MMUVSiYM and mHvHcdjd
SHOW: | | - or go back to the newest paste.
import copy
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import iterparse
2
3
class StackDict:
    """Tree of dict nodes navigated with a cursor moved by push/pop/top.

    Every node is a plain ``dict`` holding a ``'subchildren'`` mapping
    (child key -> child node) plus one entry per key of the defaults given
    at construction time.

    Attributes:
        path: Keys leading from the root node to the node under the cursor.
        default: Template applied to every newly created node.
        data: The root node.
        current_point: The node currently under the cursor.
    """

    # Built-in containers that are copied per node so that a literal such as
    # ``[]`` in the defaults is not shared (and mutated) by every node.
    _MUTABLE_CONTAINERS = (list, dict, set, bytearray)

    def __init__(self, default_value=None):
        """Create a tree whose nodes are initialised from ``default_value``.

        Args:
            default_value: Optional mapping of entry name -> default.
                Callable defaults are called once per node to produce the
                value; other defaults are stored as given (built-in mutable
                containers are copied per node).
        """
        self.path = []
        self.default = default_value or {}
        self.data = self.make_new_node()
        self.current_point = self.data

    def push(self, path):
        """Descend into child ``path`` of the current node, creating it if missing.

        Args:
            path: Key of the child to enter.

        Returns:
            True if every node along the resulting path already existed,
            False if at least one node had to be created.
        """
        self.path.append(path)
        node = self.data
        exists = True
        # Walk from the root so the cursor always agrees with ``self.path``.
        for key in self.path:
            children = node['subchildren']
            child = children.get(key)
            if child is None:
                exists = False
                # map(str, ...) keeps the message working for non-string keys.
                print("Creating child", ".".join(map(str, self.path)), key)
                child = self.make_new_node()
                children[key] = child
            node = child
        self.current_point = node
        return exists

    def make_new_node(self):
        """Return a fresh node populated from the defaults.

        Entries of the defaults are applied after ``'subchildren'`` is set.
        """
        new_node = {'subchildren': {}}
        for name, val in self.default.items():
            if callable(val):
                # Callables act as factories, e.g. ``dict`` -> a new ``{}``.
                new_node[name] = val()
            elif isinstance(val, self._MUTABLE_CONTAINERS):
                new_node[name] = copy.deepcopy(val)
            else:
                new_node[name] = val
        return new_node

    def pop(self):
        """Move the cursor back to the parent of the current node.

        Raises:
            IndexError: If the cursor is already at the root.
        """
        if not self.path:
            raise IndexError("pop from a StackDict whose cursor is at the root")
        self.path.pop()
        # The remaining path was created by earlier pushes, so it exists.
        node = self.data
        for key in self.path:
            node = node['subchildren'][key]
        self.current_point = node

    def top(self):
        """Reset the cursor to the root node."""
        self.path = []
        self.current_point = self.data
37
38
def _type_label(kind):
    """Return ``kind.__name__``, or ``repr(kind)`` for values without one (e.g. None)."""
    return getattr(kind, '__name__', repr(kind))


def print_schema(schema, prefix=' '):
    """Print every node below the schema cursor as an indented outline.

    For each child of ``schema.current_point`` this prints its key, the name
    of the type stored under ``'data'``, its ``'attributes'`` (if any) and,
    recursively, its own children. The cursor is back where it started when
    the function returns.

    Args:
        schema: Object exposing ``current_point``, ``push(key)`` and ``pop()``.
        prefix: Indentation of this level; its first character is repeated
            twice for each further level of nesting.
    """
    indent = prefix[1:]
    # Iterate over a snapshot so the loop is unaffected by tree changes.
    for node in list(schema.current_point['subchildren']):
        schema.push(node)
        current = schema.current_point
        print(indent, node, ':', sep='')
        print(indent, _type_label(current['data']))
        if current['attributes']:
            print(indent, "attributes->")
            for attr, kind in current['attributes'].items():
                print(prefix + ' ', attr, ':', _type_label(kind))
        if current['subchildren']:
            print(indent, "children->")
            # prefix[:1] (not prefix[0]) so an empty prefix does not raise.
            print_schema(schema, prefix + prefix[:1] * 2)
        # Return to the parent; otherwise the next sibling would be pushed
        # underneath this node instead of beside it.
        schema.pop()
50-
class NoData:
    """Marker type recorded when an element or attribute carries no text."""


def _identify_data_type(txt):
    """Classify a raw XML value as ``int``, ``str`` or ``NoData``.

    Returns ``NoData`` for ``None`` or an empty string, ``int`` when
    ``int(txt)`` succeeds, and ``str`` otherwise.
    """
    if txt is None or (isinstance(txt, str) and len(txt) < 1):
        return NoData
    try:
        int(txt)
    except (TypeError, ValueError):
        return str
    return int


def analyze(f, schema):
    """Merge the element structure of one XML document into ``schema``.

    For every element the schema node reached by pushing the element's tag
    receives the inferred type of each attribute not seen before under
    ``'attributes'``, and the inferred type of the element text under
    ``'data'``. A type that is already known is never overwritten; a
    ``NoData`` placeholder is replaced once an occurrence with text is seen.

    Args:
        f: File name or file object to parse (passed to ``iterparse``).
        schema: Object exposing ``push(tag)``, ``pop()`` and ``current_point``,
            whose nodes provide ``'data'`` and ``'attributes'`` entries.
    """
    for event, el in iterparse(f, ('start', 'end')):
        if event == 'start':
            schema.push(el.tag)
            node = schema.current_point
            if node['data'] is None:
                # Placeholder so the node always holds a type, even when
                # parsing stops before the matching 'end' event arrives.
                node['data'] = NoData
            known = node['attributes']
            # Attributes are complete at 'start'; text and children are not.
            for attr_name, attr_val in el.items():
                if attr_name not in known:
                    known[attr_name] = _identify_data_type(attr_val)
        elif event == 'end':
            node = schema.current_point
            if node['data'] is None or node['data'] is NoData:
                # Only leaf elements contribute a text type.
                node['data'] = _identify_data_type(el.text if len(el) < 1 else None)
            schema.pop()
            # Release the finished element's content; it stays a child of
            # its parent, so the parent's len() is unaffected.
            el.clear()
79+
    # Script section: analyse every *.xml file named on the command line and
    # print the accumulated schema after each one.
    # NOTE(review): the header enclosing this indented section and the
    # original `try:` / `except ...:` lines are not visible in the diff view;
    # the try/except below is a reconstruction -- confirm against the
    # original paste.
    import json
    import sys
    from xml.etree.ElementTree import ParseError

    class TypeEncoder(json.JSONEncoder):
        """JSON encoder that serialises the type objects stored in the schema."""

        # NOTE(review): the visible source maps only float, NoData and str;
        # the 'integer' label for int is an assumption -- confirm.
        TYPE_LABELS = {
            int: 'integer',
            float: 'double',
            NoData: None,
            str: 'string',
        }

        def default(self, obj):
            """Return the label for a known type; defer to the base class otherwise."""
            if isinstance(obj, type) and obj in self.TYPE_LABELS:
                return self.TYPE_LABELS[obj]
            # The base implementation raises TypeError for unsupported objects.
            return super().default(obj)

    schema = StackDict({
        'attributes': dict,
        'data': None,
    })
    # argv[0] is the script itself, so start at the first real argument.
    for fn in sys.argv[1:]:
        # Require a real ".xml" suffix; a bare name such as "xml" is skipped.
        if not fn.lower().endswith('.xml'):
            continue
        try:
            # Binary mode lets the XML parser apply the document's own encoding.
            with open(fn, 'rb') as f:
                analyze(f, schema)
        except KeyboardInterrupt:
            # Allow Ctrl-C to stop a long parse and still report below.
            print("Interrupted while reading", fn, file=sys.stderr)
        except (OSError, ParseError) as exc:
            print("Skipping", fn, "-", exc, file=sys.stderr)
        finally:
            print("Saving schema")
            schema.top()
            print_schema(schema)
            #with open(fn[:fn.rfind(".")]+"-structure.json",'w') as f:
            #    json.dump(schema.data,f,cls=TypeEncoder,indent=2)