#!/usr/bin/env python3
"""Print the element tree of an XML file as an indented, YAML-like listing.

The path of the XML file is taken from the first command-line argument.
The document is parsed with lxml and every node reachable from the root
element (elements, comments, processing instructions, text) is printed.
"""

import sys

from lxml import etree


def escape_text(text):
    """Return *text* escaped with Python's ``unicode_escape`` codec.

    For example, a newline becomes a backslash followed by ``n``. Double
    quotes are NOT escaped, although callers wrap the result in quotes.
    """
    return text.encode('unicode_escape').decode("utf-8")


def split_qname(name):
    """Split a qualified name into ``[namespace_uri, local_name]``.

    A name of the form ``{uri}local`` yields ``[uri, local]``; a name that
    does not start with ``{`` yields ``[None, name]``.
    """
    if name.startswith('{'):
        # Split on the LAST '}' only: a namespace URI may itself contain
        # '}', and a plain split() would then return more than two parts
        # and break the callers' two-value unpacking.
        return name[1:].rsplit('}', 1)
    return [None, name]


def print_ind(depth, *args, **kwargs):
    """Print *args* behind an indentation derived from *depth*.

    Extra positional and keyword arguments are forwarded to ``print``.
    """
    indent = ' ' * depth
    # print() inserts a separator space between `indent` and the first
    # argument, so one character is dropped here to compensate.
    indent = indent[:-1]
    print(indent, *args, **kwargs)


def _print_text(depth, text):
    """Print a ``- Text`` entry at *depth*; do nothing if *text* is empty or None."""
    if text:
        print_ind(depth, '- Text: "{}"'.format(escape_text(text)))


def _print_attributes(node, depth):
    """Print the ``attributes:`` section of *node*, sorted by printed name."""
    if not node.attrib:
        return

    print_ind(depth + 2, 'attributes:')
    attrs = []
    for name, value in node.attrib.items():
        uri, local = split_qname(name)
        # Namespaced attributes are shown as "local@uri".
        attrs.append((local + '@' + uri if uri else local, value))

    for name, value in sorted(attrs, key=lambda item: item[0]):
        print_ind(depth + 3, '{}: "{}"'.format(name, escape_text(value)))


def _print_namespaces(node, depth):
    """Print the ``namespaces:`` section built from ``node.nsmap``, sorted by prefix."""
    nsmap = node.nsmap
    if not nsmap:
        return

    print_ind(depth + 2, 'namespaces:')
    # A missing prefix is shown as the text None, an empty URI as "".
    entries = [(prefix or 'None', uri or '""') for prefix, uri in nsmap.items()]
    for prefix, uri in sorted(entries, key=lambda item: item[0]):
        print_ind(depth + 3, '{}: {}'.format(prefix, uri))


def _print_element(node, depth):
    """Print an ``- Element`` entry with its tag, attributes, namespaces and children."""
    print_ind(depth, '- Element:')

    uri, local = split_qname(node.tag)
    if uri is not None:
        print_ind(depth + 2, 'tag_name: {}@{}'.format(local, uri))
    else:
        print_ind(depth + 2, 'tag_name:', node.tag)

    _print_attributes(node, depth)
    _print_namespaces(node, depth)

    # The leading text of an element is listed as the first child, ahead
    # of the child nodes themselves.
    if len(node) or node.text:
        print_ind(depth + 2, 'children:')
        _print_text(depth + 3, node.text)
        for child in node:
            print_node(child, depth + 3)


def print_node(node, depth):
    """Recursively print *node* and its subtree at indentation *depth*.

    Comments and processing instructions are recognised by comparing
    ``node.tag`` with ``etree.Comment`` / ``etree.PI``; any other node is
    printed as an element. The node's tail text, if any, is printed after
    the node as a ``- Text`` entry at the same depth.
    """
    if node.tag is etree.Comment:
        print_ind(depth, '- Comment: "{}"'.format(escape_text(node.text)))
    elif node.tag is etree.PI:
        print_ind(depth, '- PI:')
        print_ind(depth + 2, 'target: "{}"'.format(node.target))
        print_ind(depth + 2, 'value: "{}"'.format(escape_text(node.text)))
    else:
        _print_element(node, depth)

    _print_text(depth, node.tail)


def main():
    """Parse the XML file named by ``sys.argv[1]`` and print its tree."""
    if len(sys.argv) < 2:
        sys.exit('error: expected the path of an XML file as the first argument')

    tree = etree.parse(sys.argv[1])
    root = tree.getroot()
    print('Document:')
    print_node(root, 1)


if __name__ == '__main__':
    main()