gdritter repos infer-xml / master infer-xml.py
master

Tree @master (Download .tar.gz)

infer-xml.py @master · raw · history · blame

#!/usr/bin/env python3

import xml.etree.ElementTree as xml
import sys

class Tagdata:
    """Accumulated facts about every element observed under one tag name.

    Public state:
      attrs    -- attribute name -> (optional flag, set of sample values)
      elems    -- child tag name -> optional flag
      content  -- set of text snippets passed to add_content
      observed -- observation counter; starts at 1 and is bumped by
                  note_observed
    """

    def __init__(self):
        self.attrs = {}
        self.elems = {}
        self.content = set()
        self.observed = 1

    def note_observed(self):
        """Count one more observation of this tag."""
        self.observed = self.observed + 1

    def add_attr(self, k, v):
        """Remember value ``v`` as a sample for attribute ``k``.

        A previously unknown attribute starts out as non-optional; a known
        attribute keeps whatever optional flag it already has.
        """
        if k in self.attrs:
            optional, seen = self.attrs[k]
        else:
            optional, seen = False, set()
        # Build a new set rather than mutating the stored one.
        self.attrs[k] = (optional, seen | {v})

    def add_elem(self, elem):
        """Register ``elem.tag`` as a child tag, non-optional if it is new."""
        self.elems.setdefault(elem.tag, False)

    def add_content(self, stuff):
        """Remember ``stuff`` as a sample of textual content."""
        self.content.add(stuff)

    def make_attrs_optional(self, attrs):
        """Flag every known attribute that is absent from ``attrs`` as optional."""
        present = set(attrs)
        for name in list(self.attrs):
            if name not in present:
                (_, seen) = self.attrs[name]
                self.attrs[name] = (True, seen)

    def make_elems_optional(self, elems):
        """Flag every known child tag that is absent from ``elems`` as optional."""
        present = set(elems)
        for name in list(self.elems):
            if name not in present:
                self.elems[name] = True

    def __repr__(self):
        template = 'Tagdata(attrs={}, elems={})'
        return template.format(self.attrs, self.elems)

class Traverse:
    """Walks element trees and accumulates one Tagdata per tag name.

    ``cache`` maps each tag name seen so far to the Tagdata that summarises
    every element observed with that tag.
    """

    def __init__(self):
        self.cache = {}

    @staticmethod
    def _record(tag, elem):
        """Copy elem's attributes, direct child tags and stripped text into tag."""
        for k, v in elem.attrib.items():
            tag.add_attr(k, v)

        for child in elem:
            tag.add_elem(child)

        if elem.text and elem.text.strip():
            tag.add_content(elem.text.strip())

    def add_first_element(self, elem):
        """Create the cache entry for elem's tag from elem, then visit its children.

        Everything found on the first element is recorded as non-optional;
        later observations may downgrade it.
        """
        self.cache[elem.tag] = tag = Tagdata()

        # Record this element completely before descending, so that a nested
        # element with the same tag is compared against the full picture.
        self._record(tag, elem)

        for child in elem:
            self.add_element(child)

    def add_subsequent_element(self, elem):
        """Merge another element of an already-cached tag, then visit its children.

        Attributes / child tags that this element lacks become optional, and
        so do attributes / child tags that appear here for the first time
        (every earlier observation of the tag was missing them).
        """
        tag = self.cache[elem.tag]
        tag.note_observed()

        # Work out what is new *before* recording, while tag still reflects
        # only the earlier observations.
        new_attrs = [k for k in elem.attrib if k not in tag.attrs]
        new_elems = {child.tag for child in elem if child.tag not in tag.elems}

        self._record(tag, elem)

        # Newly seen names were absent from earlier observations -> optional.
        for k in new_attrs:
            (_, samples) = tag.attrs[k]
            tag.attrs[k] = (True, samples)
        for name in new_elems:
            tag.elems[name] = True

        # Previously known names that this element lacks -> optional.
        tag.make_attrs_optional(elem.attrib.keys())
        tag.make_elems_optional([e.tag for e in elem])

        for child in elem:
            self.add_element(child)

    def add_element(self, elem):
        """Record elem (and, recursively, its descendants) in the cache."""
        if elem.tag not in self.cache:
            self.add_first_element(elem)
        else:
            self.add_subsequent_element(elem)


def sample_seq(seq, limit=5, width=24):
    """Return a comma-separated preview of the first few items of ``seq``.

    Each item is shown as its ``repr``. A repr longer than ``width``
    characters is cut down to ``width`` characters in total: its leading
    ``width - 4`` characters, then ``'...'``, then the repr's first character
    again (for a string sample that is its opening quote, so the shortened
    text still looks quoted).

    Only the first ``limit`` items are taken from ``seq``; nothing beyond
    them is consumed or formatted.

    Parameters:
      seq   -- any iterable of sample values
      limit -- maximum number of items to show (default 5)
      width -- longest repr shown untruncated (default 24)

    Returns the preview as a single string ('' for an empty ``seq``).
    """
    keep = max(width - 4, 0)
    shown = []
    # zip() asks range() first, so once ``limit`` items have been taken it
    # stops without pulling a further item out of ``seq``.
    for _, item in zip(range(limit), seq):
        text = repr(item)
        if len(text) > width:
            text = text[:keep] + '...' + text[0]
        shown.append(text)
    return ', '.join(shown)


def optional_text(is_optional):
    """Return the report wording for an optional flag.

    Truthy flags read as 'sometimes', falsy ones as 'always'.
    """
    return 'sometimes' if is_optional else 'always'


def main(paths):
    """Parse each XML file in ``paths`` and print a per-tag summary.

    The root element of every file is fed to one shared Traverse. Afterwards
    one stanza per cached tag is printed to stdout: the observation count,
    each attribute with sample values, each child tag, and sample text
    content when there is any. Stanzas are separated by a blank line.
    """
    traverse = Traverse()
    for path in paths:
        root = xml.parse(path).getroot()
        traverse.add_element(root)

    for name, data in traverse.cache.items():
        plural = '' if data.observed <= 1 else 's'
        header = 'tag {0}  (observed {1} sample{2})'.format(
            name, data.observed, plural)
        print(header)

        # Looping over an empty dict prints nothing, so no guard is needed.
        for attr, (optional, samples) in data.attrs.items():
            frequency = optional_text(optional)
            print('| - attr {0} ({1})'.format(attr, frequency))
            print('|   sample values: {0}'.format(sample_seq(samples)))

        for child, optional in data.elems.items():
            frequency = optional_text(optional)
            print('| - child {0} ({1})'.format(child, frequency))

        if data.content:
            print('| - has textual content')
            print('|   sample content: {0}'.format(sample_seq(data.content)))

        print()

# Script entry point: summarise the files named on the command line, or
# print a usage line to stderr when none were given.
if __name__ == '__main__':
    cli_paths = sys.argv[1:]
    if not cli_paths:
        usage = 'usage: {0} [file.xml] ...'.format(sys.argv[0])
        print(usage, file=sys.stderr)
    else:
        main(cli_paths)