-
Notifications
You must be signed in to change notification settings - Fork 4
/
util.py
131 lines (101 loc) · 3.31 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import io
import re
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
def xml_encode(s):
    """Escape the five XML-reserved characters in ``s``.

    Replaces ``&``, ``<``, ``>``, ``'`` and ``"`` with their predefined XML
    entities so the result can be embedded in element text or in an
    attribute value delimited by either kind of quote.

    Args:
        s: The string to escape.

    Returns:
        A new string with every reserved character replaced by its entity.
    """
    # "&" must be handled first; otherwise the ampersands introduced by the
    # later replacements would themselves be escaped a second time.
    return (s
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace("'", "&apos;")
            .replace('"', "&quot;"))
class XMLNode(object):
    """Base class for nodes of a parsed XML tree.

    A plain node has no children. Subclasses that do have children expose
    them by overriding ``__iter__``; the traversal helpers below rely only
    on that protocol.
    """

    def __iter__(self):
        """Iterate over the direct children; the base node has none."""
        return iter(())

    def flatten(self):
        """Yield this node followed by all of its descendants, depth-first.

        Children are visited in the order their parent iterates them, and a
        node is always yielded before any of its own children.
        """
        pending = [self]
        while pending:
            node = pending.pop()
            yield node
            # Push the children back-to-front so that the first child ends
            # up on top of the stack and is therefore visited next.
            children = list(node)
            children.reverse()
            pending.extend(children)

    def iter(self, tag_name):
        """Yield every ``XMLTag`` in this subtree whose tag is ``tag_name``.

        The node itself is considered as well as its descendants.
        """
        for node in self.flatten():
            if not isinstance(node, XMLTag):
                continue
            if node.tag == tag_name:
                yield node

    def itertext(self):
        """Yield the string of every ``XMLText`` node in this subtree."""
        yield from (
            node.string
            for node in self.flatten()
            if isinstance(node, XMLText)
        )
class XMLTag(XMLNode):
    """An XML element node.

    Attributes:
        tag: The element name.
        attrib: The element's attributes, stored exactly as passed in.
        contents: The ordered child nodes of this element.
    """

    __slots__ = ("tag", "attrib", "contents")

    def __init__(self, tag, attrib, contents):
        """Store the element name, its attributes and its child list."""
        self.tag, self.attrib, self.contents = tag, attrib, contents

    def __iter__(self):
        """Iterate over the direct children held in ``contents``."""
        children = self.contents
        return iter(children)
class XMLText(XMLNode):
    """A run of character data inside an element.

    Attributes:
        string: The text carried by this node.
    """

    __slots__ = ("string",)

    def __init__(self, string):
        """Wrap ``string`` as a childless text node."""
        self.string = string
class _Handler(ContentHandler):
__slots__ = ("finished_tags", "stack")
def __init__(self):
self.finished_tags = []
self.stack = [] # stack of XMLTag
def reset(self):
self.finished_tags.clear()
self.stack.clear()
# def startDocument(self):
# print("startDocument()")
# def endDocument(self):
# print("endDocument()")
def startElement(self, name, attrs):
# print("startElement({!r}, {!r})".format(name, attrs))
self.stack.append(XMLTag(name, attrs, []))
def endElement(self, name):
# print("endElement({!r})".format(name))
tag = self.stack.pop()
if self.stack:
self.stack[-1].contents.append(tag)
else:
self.finished_tags.append(tag)
# print(" --> DONE: {}".format(self.finished_tags[-1]))
def characters(self, text):
# print("characters({!r})".format(text))
if len(self.stack) > 0:
self.stack[-1].contents.append(XMLText(text))
class XMLMuncher(object):
    """Incrementally turns a stream of XML fragments into element trees.

    A SAX parser is primed with a document prologue and an opening ``<coq>``
    element, so that the fragments fed afterwards are parsed as that
    element's children. The handler is reset right after priming, which
    removes ``<coq>`` from its stack; each fragment element therefore
    counts as top-level and is reported as soon as it is closed.

    Attributes:
        parser: The SAX parser receiving the data.
        handler: The ``_Handler`` that collects the finished elements.
    """

    __slots__ = ("parser", "handler")

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard all parsing state and start over with a fresh parser."""
        parser = self.parser = make_parser()
        handler = self.handler = _Handler()
        parser.setContentHandler(handler)

        # Prologue: XML declaration, a DOCTYPE declaring the ``nbsp``
        # entity, and the opening tag of the wrapping root element.
        prologue = "\n".join((
            '<?xml version="1.0"?>',
            "<!DOCTYPE coq [",
            '<!ENTITY nbsp " ">',
            "]>",
            "<coq>",
        ))
        parser.feed(prologue)

        # Drop the just-opened root from the handler's stack so that the
        # elements fed later are treated as top-level.
        handler.reset()

    def process(self, buf):
        """Feed ``buf`` to the parser and yield each element it completed.

        This is a generator: nothing is fed until iteration starts, and the
        handler's list of finished elements is only cleared once the
        generator has been run to exhaustion.
        """
        self.parser.feed(buf)
        for finished in self.handler.finished_tags:
            yield finished
        self.handler.finished_tags.clear()
def byte_to_character_offset(text, byte_offset, charset):
    """Convert a byte offset into ``text`` to a character offset.

    Generally, this function obeys the law:

        text[:byte_to_character_offset(text, byte_offset, charset)] ==
            text.encode(charset)[:byte_offset].decode(charset)

    Note that if byte_offset falls in the middle of a character, this function
    silently rounds up to the next character.

    Each character is encoded on its own to measure its width, so the result
    is only meaningful for charsets whose per-character encoding carries no
    extra prefix; a codec that emits a byte-order mark on every ``encode``
    call (such as ``"utf-16"``) will be over-counted.

    Args:
        text: The string the offset refers to.
        byte_offset: Offset into ``text.encode(charset)``, in bytes.
        charset: Name of the encoding the byte offset was measured in.

    Returns:
        The number of leading characters of ``text`` needed to cover
        ``byte_offset`` bytes. An offset of zero or less yields 0; an offset
        beyond the end of the encoded text yields ``len(text)``.
    """
    bytes_spanned = 0
    for position, char in enumerate(text):
        if bytes_spanned >= byte_offset:
            return position
        bytes_spanned += len(char.encode(charset))
    # Either the offset lands exactly on the end of the text or it points
    # past it; in both cases the whole string is covered.
    return len(text)