1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
from xml.sax.saxutils import unescape
# Public names exported by a star-import of this module.
__all__ = [ "ParseError", "XMLParser", ]
# References that may appear in a serialized attribute value in addition to
# the &lt; / &gt; / &amp; handled by xml.sax.saxutils.unescape itself.
# Whitespace is listed in both decimal and hexadecimal spellings.
_ATTR_ENTITIES = {
    "&#9;": "\t",
    "&#x9;": "\t",
    "&#10;": "\n",
    "&#xA;": "\n",
    "&#xa;": "\n",
    "&#13;": "\r",
    "&#xD;": "\r",
    "&#xd;": "\r",
    "&quot;": '"',
    "&apos;": "'",
}


def unescapeattr(d):
    """Return the attribute value *d* with its XML escapes decoded.

    Decodes the whitespace character references for tab, carriage return and
    line feed, the quote entities ``&quot;`` / ``&apos;``, and the
    ``&lt;`` / ``&gt;`` / ``&amp;`` entities handled by ``unescape``.

    Literal whitespace in *d* is left untouched; only the escaped forms are
    translated.  ``&amp;`` is decoded last (by ``unescape``), so an escaped
    ampersand followed by ``#10;`` yields the text ``&#10;`` rather than a
    newline.
    """
    return unescape(d, _ATTR_ENTITIES)
class ParseError(Exception):
    """Error raised when the XML input cannot be parsed."""
class XMLParser(object):
    """Small character-by-character XML parser that reports to a target.

    The target object must provide ``start(name, attrs)``, ``data(text)`` and
    ``end(name)``.  For every element the parser calls ``start`` when the
    start tag is complete, then one ``data`` call followed by ``end`` when
    the element is closed.  All character data directly inside an element
    (including merged CDATA sections) is collected and delivered in that
    single ``data`` call, i.e. after the events of any child elements.

    Comments (``<!-- -->``) and processing instructions / XML declarations
    (``<? ?>``) are skipped without producing events.

    Relies on the module-level names ``unescape``, ``unescapeattr`` and
    ``ParseError``.
    """

    class __Tag(object):
        """Mutable parse state of one open construct on the parser stack."""

        def __init__(self,
                     skip=0,
                     state="tagname"):
            self.name = ""         # element name, accumulated in state "tagname"
            self.data = []         # pieces of (still escaped) character data
            self.attrs = {}        # attribute name -> unescaped value
            self.attrName = []     # characters of the attribute being read
            self.attrData = []     # characters of the attribute value being read
            self.attrQuote = '"'   # quote character that ends the current value
            self.end = False       # True once the closing delimiter was seen
            self.skip = skip       # number of upcoming characters to ignore
            # One of: "tagname", "taghead", "attrname", "attrdata", "data",
            # "comment", "head", "cdata".
            self.state = state

    def __init__(self, html=0, target=None, encoding=None):
        """Create a parser that sends its events to *target*.

        *target* is required.  *html* and *encoding* are accepted but are not
        used by this implementation.
        """
        assert target, "'target' must be provided. Standard TreeBuilder currently not implemented."
        self.target = target

    def _finish_tag(self, tag, tree):
        """Close *tag* (the top of *tree*) and return its parent, or None.

        Elements report their collected text and their end to the target;
        comments, processing instructions and CDATA sections report nothing.
        The text of a CDATA section is handed over to the parent element.
        """
        if tag.state not in ("comment", "head", "cdata"):
            self.target.data(unescape("".join(tag.data)))
            self.target.end(tag.name)
        tree.pop()
        parent = tree[-1] if tree else None
        if tag.state == "cdata" and parent:
            # CDATA content is literal.  The parent's text is unescaped when
            # the parent is closed, so protect every "&" here; unescape()
            # then restores the CDATA text exactly as written.
            parent.data.append("".join(tag.data).replace("&", "&amp;"))
        return parent

    def feed(self, text):
        """Parse the complete document *text* and emit events to the target.

        *text* may be ``str`` or a bytes-like object with a ``decode`` method;
        the latter is decoded as UTF-8.  Parsing state is local to this call,
        so each call must be given a whole document.

        Raises:
            ParseError: for an attribute whose ``=`` is not directly followed
                by a quote, for an end tag that does not match the open
                element, and for non-whitespace text outside any element.
        """
        tag = None   # innermost open construct, or None at document level
        tree = []    # stack of open constructs; tree[-1] is ``tag``
        if not isinstance(text, str):
            text = text.decode("UTF-8")
        for i, c in enumerate(text):
            # Ignore characters of a delimiter that was already recognized.
            if tag and tag.skip > 0:
                tag.skip -= 1
                continue

            if tag and tag.end:
                tag = self._finish_tag(tag, tree)
                # Fall through: ``c`` still has to be handled by the parent.

            if tag and tag.state == "comment":
                if text.startswith("-->", i):
                    tag.skip = 2
                    tag.end = True
                continue

            if tag and tag.state == "head":
                if text.startswith("?>", i):
                    tag.skip = 1
                    tag.end = True
                continue

            if tag and tag.state == "cdata":
                if text.startswith("]]>", i):
                    tag.skip = 2
                    tag.end = True
                else:
                    tag.data.append(c)
                continue

            if tag and tag.state == "attrname":
                if c == "=":
                    if text.startswith('="', i):
                        tag.attrQuote = '"'
                    elif text.startswith("='", i):
                        tag.attrQuote = "'"
                    else:
                        raise ParseError("Invalid attribute quoting.")
                    tag.skip = 1  # skip quote
                    tag.state = "attrdata"
                    tag.attrData = []
                else:
                    tag.attrName.append(c)
                continue

            if tag and tag.state == "attrdata":
                if c == tag.attrQuote:
                    tag.state = "taghead"
                    tag.attrs["".join(tag.attrName)] = unescapeattr("".join(tag.attrData))
                    tag.attrName = []
                    tag.attrData = []
                else:
                    tag.attrData.append(c)
                continue

            if tag and tag.state == "taghead":
                if c == ">":
                    self.target.start(tag.name, tag.attrs)
                    tag.state = "data"
                elif text.startswith("/>", i):
                    # Empty-element tag: started and ended at once.
                    self.target.start(tag.name, tag.attrs)
                    tag.skip = 1
                    tag.end = True
                elif not c.isspace():
                    tag.attrName = [c]
                    tag.state = "attrname"
                continue

            if tag and tag.state == "tagname":
                if c == ">":
                    tag.state = "data"
                    self.target.start(tag.name, tag.attrs)
                elif text.startswith("/>", i):
                    # Empty-element tag without a space before "/>", e.g.
                    # "<br/>": the slash must not become part of the name.
                    self.target.start(tag.name, tag.attrs)
                    tag.skip = 1
                    tag.end = True
                elif c.isspace():
                    tag.state = "taghead"
                else:
                    tag.name += c
                continue

            # From here on ``tag`` is None or an element in state "data".
            if text.startswith("<!--", i):
                tag = self.__Tag(state="comment", skip=3)
                tree.append(tag)
                continue

            if text.startswith("<?", i):
                tag = self.__Tag(state="head", skip=1)
                tree.append(tag)
                continue

            if text.startswith("<![CDATA[", i):
                tag = self.__Tag(state="cdata", skip=8)
                tree.append(tag)
                continue

            if c == "<" and not text.startswith("</", i):
                tag = self.__Tag()
                tree.append(tag)
                continue

            if tag and tag.state == "data":
                if text.startswith("</" + tag.name + ">", i):
                    tag.skip = len("</" + tag.name + ">") - 1
                    tag.end = True
                elif text.startswith("</", i):
                    raise ParseError("Invalid end tag")
                else:
                    tag.data.append(c)
                continue

            if c.strip():
                raise ParseError("Trailing characters.")

        # A closing delimiter is normally processed when the next character
        # arrives.  If the input stops right after it, there is no next
        # character, so the pending end has to be delivered here.
        if tag and tag.end:
            self._finish_tag(tag, tree)

    def close(self):
        """Do nothing; present so the parser offers a feed()/close() pair."""
        pass
|