#!/usr/bin/env python3
import xml.etree.ElementTree as xml
import sys
class Tagdata:
    """Accumulated facts about all elements that share one tag name.

    Attributes:
        attrs: maps attribute name -> ``(optional, samples)`` where
            ``optional`` is a bool and ``samples`` is a set of seen values.
        elems: maps child tag name -> bool "optional" flag.
        content: set of text samples recorded via ``add_content``.
        observed: number of elements folded into this record; starts at 1
            and is bumped by ``note_observed``.
    """

    def __init__(self):
        self.attrs = {}
        self.elems = {}
        self.content = set()
        self.observed = 1

    def note_observed(self):
        """Count one more element of this tag."""
        self.observed += 1

    def add_attr(self, k, v):
        """Record value ``v`` as a sample for attribute ``k``.

        An attribute that shows up for the first time after earlier
        elements were already counted was absent from those elements, so it
        starts out optional instead of mandatory.
        """
        optional, samples = self.attrs.get(k, (self.observed > 1, set()))
        # Build a new set rather than mutating, so a previously handed-out
        # sample set is never changed behind the caller's back.
        self.attrs[k] = (optional, samples | {v})

    def add_elem(self, elem):
        """Record ``elem.tag`` as a child tag, keeping any existing flag.

        As with attributes, a child tag first seen after earlier elements
        were counted starts out optional.
        """
        self.elems.setdefault(elem.tag, self.observed > 1)

    def add_content(self, stuff):
        """Record ``stuff`` as a text sample."""
        self.content.add(stuff)

    def make_attrs_optional(self, attrs):
        """Flag every known attribute that is not in ``attrs`` as optional."""
        for name in set(self.attrs) - set(attrs):
            _, samples = self.attrs[name]
            self.attrs[name] = (True, samples)

    def make_elems_optional(self, elems):
        """Flag every known child tag that is not in ``elems`` as optional."""
        for name in set(self.elems) - set(elems):
            self.elems[name] = True

    def __repr__(self):
        return 'Tagdata(attrs={}, elems={})'.format(
            self.attrs, self.elems)
class Traverse:
    """Walk XML element trees and accumulate one ``Tagdata`` per tag name.

    ``cache`` maps tag name -> ``Tagdata``; entries are created the first
    time a tag is encountered, so the mapping is in first-seen order.
    """

    def __init__(self):
        self.cache = {}

    def add_first_element(self, elem):
        """Create the cache entry for ``elem.tag`` from its first occurrence."""
        self.cache[elem.tag] = tag = Tagdata()
        self._record(tag, elem)
        self._descend(elem)

    def add_subsequent_element(self, elem):
        """Fold another occurrence of an already-cached tag into its record.

        Attributes and child tags present on earlier occurrences but missing
        here, and ones present here but never seen before, are both flagged
        optional.

        Raises:
            KeyError: if ``elem.tag`` has no cache entry yet.
        """
        tag = self.cache[elem.tag]
        tag.note_observed()
        child_tags = [child.tag for child in elem]
        # Work out what is new *before* recording, while the record still
        # reflects only the earlier occurrences.
        new_attrs = set(elem.attrib) - set(tag.attrs)
        new_children = set(child_tags) - set(tag.elems)
        self._record(tag, elem)
        # Earlier occurrences lacked these names, so they cannot be mandatory.
        for name in new_attrs:
            _, samples = tag.attrs[name]
            tag.attrs[name] = (True, samples)
        for name in new_children:
            tag.elems[name] = True
        tag.make_attrs_optional(elem.attrib.keys())
        tag.make_elems_optional(child_tags)
        self._descend(elem)

    def add_element(self, elem):
        """Record ``elem`` and, recursively, all of its descendants."""
        if elem.tag in self.cache:
            self.add_subsequent_element(elem)
        else:
            self.add_first_element(elem)

    @staticmethod
    def _record(tag, elem):
        """Store ``elem``'s own attributes, child tags and text in ``tag``."""
        for k, v in elem.attrib.items():
            tag.add_attr(k, v)
        for child in elem:
            tag.add_elem(child)
        text = elem.text.strip() if elem.text else ''
        if text:
            tag.add_content(text)

    def _descend(self, elem):
        """Recurse into the children of ``elem``.

        Called only after ``elem`` itself is fully recorded, so a nested
        element with the same tag never sees a half-updated record.
        """
        for child in elem:
            self.add_element(child)
def sample_seq(seq):
    """Return a comma-separated preview of at most five items of ``seq``.

    Each item is shown as its ``repr``.  A repr longer than 24 characters is
    shortened to its first 20 characters, then ``...``, then the repr's own
    first character (for a string that is the opening quote, so the
    shortened text still reads as quoted).

    Only the first five items are pulled from ``seq``; the rest of the
    iterable is never touched.
    """
    previews = []
    for item in seq:
        text = repr(item)
        if len(text) > 24:
            text = text[:20] + '...' + text[0]
        previews.append(text)
        # Stop as soon as the preview is full instead of formatting
        # everything and discarding all but five.
        if len(previews) == 5:
            break
    return ', '.join(previews)
def optional_text(is_optional):
    """Return ``'sometimes'`` for a truthy flag, ``'always'`` otherwise."""
    return 'sometimes' if is_optional else 'always'
def main(paths):
    """Parse the XML files at ``paths`` and print a summary for each tag.

    All files feed one shared ``Traverse``.  For every tag in its cache the
    report lists the observation count, each attribute and child tag with
    an always/sometimes marker, and sample attribute values and text.
    Each tag's section ends with a blank line.
    """
    walker = Traverse()
    for path in paths:
        root = xml.parse(path).getroot()
        walker.add_element(root)

    for name, data in walker.cache.items():
        plural = 's' if data.observed > 1 else ''
        print('tag {0} (observed {1} sample{2})'.format(
            name, data.observed, plural))

        # Looping over an empty mapping prints nothing, so no separate
        # emptiness check is needed for attrs or elems.
        for attr, (optional, samples) in data.attrs.items():
            print('| - attr {0} ({1})'.format(attr, optional_text(optional)))
            print('| sample values: {0}'.format(sample_seq(samples)))

        for child, optional in data.elems.items():
            print('| - child {0} ({1})'.format(child, optional_text(optional)))

        if data.content:
            print('| - has textual content')
            print('| sample content: {0}'.format(sample_seq(data.content)))

        print()
if __name__ == '__main__':
    # Script entry point: every command-line argument is taken as the path
    # of an XML file to analyse; with no arguments, print a usage line to
    # stderr instead.
    cli_paths = sys.argv[1:]
    if not cli_paths:
        sys.stderr.write(
            'usage: {0} [file.xml] ...\n'.format(sys.argv[0]))
    else:
        main(cli_paths)