gdritter repos infer-xml / 6064312
basic script for inferring info about XML structures Getty Ritter 5 years ago
1 changed file(s) with 92 addition(s) and 0 deletion(s). Collapse all Expand all
1 #!/usr/bin/env python3
2
3 import xml.etree.ElementTree as xml
4 import sys
5
class Tagdata:
    """Accumulated facts about all elements that share one tag name."""

    def __init__(self):
        # attribute name -> (optional, samples); ``optional`` is a bool and
        # ``samples`` is the set of values recorded for that attribute.
        self.attrs = {}
        # child tag name -> optional flag (bool).
        self.elems = {}
        # content samples recorded through add_content().
        self.content = set()

    def add_attr(self, k, v):
        """Record value *v* as a sample for attribute *k*.

        An attribute that was not known before starts out mandatory; a known
        attribute keeps whatever optional flag it already has.
        """
        optional, samples = self.attrs.get(k, (False, set()))
        self.attrs[k] = (optional, samples | {v})

    def add_elem(self, elem):
        """Record ``elem.tag`` as a child tag.

        A child tag that was not known before starts out mandatory; a known
        one keeps its current optional flag.
        """
        self.elems[elem.tag] = self.elems.get(elem.tag, False)

    def add_content(self, stuff):
        """Add content samples.

        *stuff* may be a single string, recorded as one sample, or any other
        iterable, each item of which is recorded as a sample.
        """
        if isinstance(stuff, str):
            # set("abc") would split the text into {'a', 'b', 'c'}; keep a
            # string together as one sample instead.
            self.content.add(stuff)
        else:
            self.content.update(stuff)

    def make_attrs_optional(self, attrs):
        """Flag every known attribute that is not in *attrs* as optional."""
        for name in set(self.attrs) - set(attrs):
            _, samples = self.attrs[name]
            self.attrs[name] = (True, samples)

    def make_elems_optional(self, elems):
        """Flag every known child tag that is not in *elems* as optional."""
        for name in set(self.elems) - set(elems):
            self.elems[name] = True

    def __repr__(self):
        return 'Tagdata(attrs={}, elems={})'.format(
            self.attrs, self.elems)
36
class Traverse:
    """Walk an XML element tree, keeping one Tagdata record per tag name."""

    def __init__(self):
        # tag name -> Tagdata describing every element of that name seen so far.
        self.cache = {}

    def add_first_element(self, elem):
        """Create the record for ``elem.tag`` from its first occurrence.

        Every attribute and child tag found on this element starts out
        mandatory; later occurrences may turn them optional.
        """
        self.cache[elem.tag] = tag = Tagdata()

        for k, v in elem.attrib.items():
            tag.add_attr(k, v)

        children = list(elem)
        # Record the whole child list before descending: a descendant with
        # the same tag name updates this same record, and must be compared
        # against the complete picture of this element.
        for child in children:
            tag.add_elem(child)
        for child in children:
            self.add_element(child)

    def add_subsequent_element(self, elem):
        """Merge a further occurrence of ``elem.tag`` into its record.

        An attribute or child tag stays mandatory only if it was already
        known *and* appears on this element. Anything missing here, or seen
        here for the first time (and therefore absent from the earlier
        occurrences), becomes optional.
        """
        tag = self.cache[elem.tag]

        known_attrs = set(tag.attrs)
        for k, v in elem.attrib.items():
            tag.add_attr(k, v)
        # make_attrs_optional flags every recorded attribute that is NOT in
        # the collection it receives, so passing the intersection flags both
        # the ones missing here and the ones that are new here.
        tag.make_attrs_optional(known_attrs & set(elem.attrib))

        children = list(elem)
        known_elems = set(tag.elems)
        for child in children:
            # Register the child, not the element being merged.
            tag.add_elem(child)
        tag.make_elems_optional(known_elems & {child.tag for child in children})

        # Descend only after this element's own shape is fully merged.
        for child in children:
            self.add_element(child)

    def add_element(self, elem):
        """Record *elem* and, recursively, all of its descendants."""
        if elem.tag not in self.cache:
            self.add_first_element(elem)
        else:
            self.add_subsequent_element(elem)
69
70
def main(path):
    """Parse the XML file at *path* and print what was inferred per tag.

    For each tag name a ``tag`` line is printed, followed by one line per
    recorded attribute and one per recorded child tag, each labelled
    ``optional`` or ``mandatory``.
    """
    def requirement(optional):
        # Human-readable label for an optional flag.
        if optional:
            return 'optional'
        return 'mandatory'

    walker = Traverse()
    root = xml.parse(path).getroot()
    walker.add_element(root)

    for tag_name, info in walker.cache.items():
        print('tag {0}'.format(tag_name))
        # The attribute samples are collected but not part of the report.
        for attr_name, (is_optional, _samples) in info.attrs.items():
            print(' - attr {0} ({1})'.format(
                attr_name, requirement(is_optional)))
        for child_name, is_optional in info.elems.items():
            print(' - child {0} ({1})'.format(
                child_name, requirement(is_optional)))
86
if __name__ == '__main__':
    # Only the first command-line argument is used: the XML file to inspect.
    if sys.argv[1:]:
        main(sys.argv[1])
    else:
        sys.stderr.write(
            'usage: {0} [file.xml]\n'.format(sys.argv[0]))
        # Report the usage error to the caller instead of exiting with 0.
        sys.exit(2)