basic script for inferring info about XML structures
Getty Ritter
6 years ago
| 1 | #!/usr/bin/env python3 | |
| 2 | ||
| 3 | import xml.etree.ElementTree as xml | |
| 4 | import sys | |
| 5 | ||
class Tagdata:
    """Accumulated facts about every element that shares one tag name.

    Attributes:
        attrs: maps attribute name -> ``(optional, samples)`` where
            ``optional`` is a bool flag and ``samples`` is the set of
            values recorded for that attribute.
        elems: maps child tag name -> bool "optional" flag.
        content: set of content items collected through ``add_content``.
    """

    def __init__(self):
        self.attrs = {}
        self.elems = {}
        self.content = set()

    def add_attr(self, k, v):
        """Record value ``v`` for attribute ``k``.

        An attribute seen for the first time starts out non-optional
        with an empty sample set; an existing optional flag is kept.
        """
        known = self.attrs.get(k)
        if known is None:
            flag, seen = False, set()
        else:
            flag, seen = known
        # union() builds a fresh set, so the previous sample set is
        # never mutated in place.
        self.attrs[k] = (flag, seen.union((v,)))

    def add_elem(self, elem):
        """Record ``elem.tag`` as a child tag, non-optional if new."""
        self.elems.setdefault(elem.tag, False)

    def add_content(self, stuff):
        """Add every item yielded by iterating ``stuff`` to ``content``."""
        self.content.update(stuff)

    def make_attrs_optional(self, attrs):
        """Flag every known attribute that is not listed in ``attrs``."""
        present = set(attrs)
        for name in list(self.attrs):
            if name not in present:
                samples = self.attrs[name][1]
                self.attrs[name] = (True, samples)

    def make_elems_optional(self, elems):
        """Flag every known child tag that is not listed in ``elems``."""
        present = set(elems)
        for name in list(self.elems):
            if name not in present:
                self.elems[name] = True

    def __repr__(self):
        template = 'Tagdata(attrs={}, elems={})'
        return template.format(self.attrs, self.elems)
| 36 | ||
class Traverse:
    """Walk an element tree and build one ``Tagdata`` per distinct tag name.

    ``cache`` maps tag name -> ``Tagdata``. An attribute or child tag is
    left mandatory only while every element with that tag seen so far
    carries it; otherwise it is flagged optional.
    """

    def __init__(self):
        self.cache = {}

    def add_first_element(self, elem):
        """Register the first element seen with ``elem.tag``.

        Its attributes and direct children are recorded as mandatory,
        then its children are traversed.
        """
        self.cache[elem.tag] = tag = Tagdata()
        self._record(tag, elem)
        self._descend(elem)

    def add_subsequent_element(self, elem):
        """Merge a further element with an already-known tag name.

        Anything not shared between this element and the elements seen
        before it is flagged optional, then its children are traversed.
        """
        tag = self.cache[elem.tag]

        # Snapshot what earlier elements had, before this one adds to it.
        known_attrs = set(tag.attrs)
        known_elems = set(tag.elems)

        self._record(tag, elem)

        # make_*_optional() flags every recorded name that is absent from
        # its argument. Passing only the names present both before and
        # on this element flags names this element lacks as well as
        # names that earlier elements lacked.
        tag.make_attrs_optional(known_attrs & set(elem.attrib))
        tag.make_elems_optional(known_elems & {child.tag for child in elem})

        self._descend(elem)

    def add_element(self, elem):
        """Add ``elem`` and, recursively, all of its descendants."""
        if elem.tag not in self.cache:
            self.add_first_element(elem)
        else:
            self.add_subsequent_element(elem)

    def _record(self, tag, elem):
        # Note the attributes and direct child tags carried by ``elem``.
        for k, v in elem.attrib.items():
            tag.add_attr(k, v)
        for child in elem:
            tag.add_elem(child)

    def _descend(self, elem):
        # Recurse only after ``elem`` itself is fully recorded, so that a
        # nested element with the same tag is compared against complete
        # data.
        for child in elem:
            self.add_element(child)
| 69 | ||
| 70 | ||
def main(path):
    """Parse the XML file at ``path`` and print what was found per tag.

    For each tag name one header line is printed, followed by one line
    per recorded attribute and one line per recorded child tag, each
    labelled ``optional`` or ``mandatory``.
    """
    def requirement(is_optional):
        # Turn the stored flag into the label shown in the report.
        if is_optional:
            return 'optional'
        return 'mandatory'

    traversal = Traverse()
    root = xml.parse(path).getroot()
    traversal.add_element(root)

    for tag_name, info in traversal.cache.items():
        print('tag {0}'.format(tag_name))
        # Sample values are collected but not part of the report.
        for attr_name, (is_optional, _samples) in info.attrs.items():
            print(' - attr {0} ({1})'.format(
                attr_name, requirement(is_optional)))
        for child_name, is_optional in info.elems.items():
            print(' - child {0} ({1})'.format(
                child_name, requirement(is_optional)))
| 86 | ||
if __name__ == '__main__':
    # Script entry point: the first command-line argument is the XML
    # file to analyse; without one, print a usage line to stderr.
    cli_args = sys.argv[1:]
    if not cli_args:
        usage = 'usage: {0} [file.xml]\n'.format(sys.argv[0])
        sys.stderr.write(usage)
    else:
        main(cli_args[0])