-
Notifications
You must be signed in to change notification settings - Fork 37
/
microdata.py
executable file
·251 lines (192 loc) · 6.41 KB
/
microdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env python
import sys
import json
import html5lib
from collections import defaultdict
from urllib.request import urlopen, Request
# Value sent as the User-Agent header on the HTTP requests issued by main().
USER_AGENT = "microdata.py <https://github.com/edsu/microdata>"
def main():
    """Command-line entry point: print the microdata found at each URL.

    Every command-line argument is treated as a URL. For each one the URL is
    echoed to stderr, the page is fetched with the module's User-Agent, and a
    JSON document of the form ``{"items": [...]}`` is printed to stdout.

    Exits with status 1 after printing a usage message when no URL is given.
    """
    if len(sys.argv) < 2:
        print("Usage: microdata <URL>")
        sys.exit(1)

    for url in sys.argv[1:]:
        sys.stderr.write(url + "\n")

        req = Request(url, headers={"User-Agent": USER_AGENT})
        # The context manager closes the HTTP response even if parsing
        # raises; previously the connection was left open.
        with urlopen(req) as response:
            items = [item.json_dict() for item in get_items(response)]

        print(json.dumps({"items": items}, indent=2))
def get_items(location, encoding=None):
    """
    Parse an HTML document and return the list of microdata Items it holds.

    ``location`` is handed straight to html5lib's parser (a string or a
    file-like object). When ``encoding`` is truthy it is forwarded to the
    parser as ``transport_encoding``; otherwise the parser is called without
    it.
    """
    builder = html5lib.treebuilders.getTreeBuilder("dom")
    html_parser = html5lib.HTMLParser(tree=builder)

    # Only pass transport_encoding when the caller actually supplied one.
    parse_options = {"transport_encoding": encoding} if encoding else {}
    document = html_parser.parse(location, **parse_options)

    return _find_items(document)
class Item(object):
    """
    A class for representing a microdata Item.

    Property values are stored per name in ``props`` as lists, in the order
    they were set. As a convenience, any attribute that is not otherwise
    defined on the instance is looked up as a property name and resolves to
    the first value stored for it, or None when there is none.
    """

    def __init__(self, itemtype=None, itemid=None):
        """Create an Item, with an optional itemtype and/or itemid.

        ``itemtype`` is a whitespace-delimited string of type URIs and is
        stored as a list of URI objects; ``itemid`` is stored as a single
        URI. Neither attribute is assigned when the argument is empty, in
        which case reading it falls through to the property lookup and
        normally yields None.
        """
        self.props = {}

        # split() with no argument ignores runs of spaces, tabs and newlines,
        # so irregular whitespace never produces an empty-string type.
        if itemtype:
            self.itemtype = [URI(i) for i in itemtype.split()]

        if itemid:
            self.itemid = URI(itemid)

    def __getattr__(self, name):
        """Resolve an unknown attribute as a property lookup.

        Only invoked when normal attribute lookup fails. Raises
        AttributeError for ``props`` itself and for dunder names: without
        that guard, protocol probes such as ``__setstate__`` on a
        not-yet-initialised instance (as done by copy and pickle) recurse
        without bound through ``self.props``, and special-method lookups
        receive None instead of a clean "missing" answer.
        """
        if name == "props" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        return self.get(name)

    def set(self, name, value):
        """Set an item's property.

        Values accumulate: setting the same name again appends to the list
        of values kept for that name.
        """
        if name in self.props:
            self.props[name].append(value)
        else:
            self.props[name] = [value]

    def get(self, name):
        """Get an item's property. In cases where there are multiple values for
        a given property this returns only the first. If the property is
        not set None is returned.
        """
        values = self.get_all(name)
        if len(values) > 0:
            return values[0]
        return None

    def get_all(self, name):
        """Get all the values for a given property. If the property is not
        set for the Item an empty list is returned.
        """
        if name in self.props:
            return self.props[name]
        else:
            return []

    def json(self):
        """Return the Item serialized as an indented JSON string."""
        return json.dumps(self.json_dict(), indent=2)

    def json_dict(self):
        """Return the item, and its nested items, as a python dictionary.

        The result has a ``properties`` mapping of name to list of values,
        plus ``type`` (list of strings) and ``id`` (string) when the item has
        them. Nested Items are converted recursively and URI values are
        reduced to their string.
        """
        item = {}
        if self.itemtype:
            item['type'] = [i.string for i in self.itemtype]
        if self.itemid:
            item['id'] = self.itemid.string
        item['properties'] = props = defaultdict(list)
        for name, values in self.props.items():
            for v in values:
                if isinstance(v, Item):
                    props[name].append(v.json_dict())
                elif isinstance(v, URI):
                    props[name].append(v.string)
                else:
                    props[name].append(v)
        return item
class URI(object):
    """Thin wrapper marking a string value as a URI.

    Two URIs are equal when their strings are equal; a URI never compares
    equal to a non-URI object (including a plain string with the same text).
    """

    def __init__(self, string):
        self.string = string

    def __eq__(self, other):
        if isinstance(other, URI):
            return self.string == other.string
        return False

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None in Python 3, which made
        # URIs unusable in sets and as dict keys. Hash on the same field that
        # equality compares so equal URIs hash alike.
        return hash(self.string)

    def __repr__(self):
        return self.string
# what follows are the guts of extracting the Items from a DOM
# Maps a tag name to the attribute that holds its property value, grouped
# here by attribute. Tags not listed are handled by the fallback in
# _property_value().
property_values = {
    tag: attribute
    for attribute, tags in (
        ("content", ("meta",)),
        ("src", ("audio", "embed", "iframe", "img", "source", "video")),
        ("href", ("a", "area", "link")),
        ("data", ("object",)),
        ("datetime", ("time",)),
    )
    for tag in tags
}
def _find_items(e):
    """Collect the Items found at or below DOM node ``e``, as a list.

    If ``e`` is not itself an itemscope element, the search simply recurses
    into each of its children. If it is, an Item is built from it, and the
    search continues only in the itemscope elements that _extract() reported
    as not being properties of that Item.
    """
    if not (_is_element(e) and _is_itemscope(e)):
        found = []
        for node in e.childNodes:
            found.extend(_find_items(node))
        return found

    top_item = _make_item(e)
    leftovers = _extract(e, top_item)

    found = [top_item]
    for element in leftovers:
        found.extend(_find_items(element))
    return found
def _extract(e, item):
    """Assign the microdata found beneath DOM element ``e`` to ``item``.

    Walks the children of ``e`` recursively. A child with ``itemprop``
    contributes a value to ``item`` under every name listed in the attribute:
    a nested Item when the child is also an itemscope, otherwise the child's
    property value.

    Returns the list of elements that carry ``itemscope`` but no ``itemprop``
    and so start items unrelated to ``item``.
    """
    unlinked = []
    for child in e.childNodes:
        itemprop = _attr(child, "itemprop")
        itemscope = _is_itemscope(child)

        # itemprop may hold several names. split() with no argument tolerates
        # repeated spaces, tabs and newlines, so no empty property name is
        # ever produced.
        names = itemprop.split() if itemprop else []

        if names and itemscope:
            # Build and populate the nested item once, then expose it under
            # every name. Re-extracting per name would report the same
            # unlinked elements repeatedly and duplicate top-level items.
            nested_item = _make_item(child)
            unlinked.extend(_extract(child, nested_item))
            for name in names:
                item.set(name, nested_item)
        elif names:
            value = _property_value(child)
            for name in names:
                item.set(name, value)
            # A plain property element may still contain further properties
            # of the same item.
            unlinked.extend(_extract(child, item))
        elif itemscope:
            unlinked.append(child)
        else:
            unlinked.extend(_extract(child, item))

    return unlinked
# helper functions around python's minidom
def _attr(e, name):
    """Return attribute ``name`` of node ``e``.

    Yields None when ``e`` is not an element node or does not have the
    attribute; an attribute that is present but empty is returned as-is.
    """
    if not _is_element(e):
        return None
    if not e.hasAttribute(name):
        return None
    return e.getAttribute(name)
def _is_element(e):
return e.nodeType == e.ELEMENT_NODE
def _is_itemscope(e):
    """Tell whether ``e`` has an ``itemscope`` attribute, whatever its value."""
    marker = _attr(e, "itemscope")
    return marker is not None
def _property_value(e):
    """Return the microdata property value carried by element ``e``.

    The attribute to read is chosen from ``property_values`` by tag name:

    * ``href`` / ``src`` attributes are wrapped in a URI;
    * a ``time`` element yields its ``datetime`` attribute, or its text
      content when that attribute is absent;
    * any other mapped attribute is returned as a plain string;
    * unmapped tags yield their ``content`` attribute if it is non-empty,
      otherwise the element's text content.
    """
    attrib = property_values.get(e.tagName)

    if attrib in ("href", "src"):
        return URI(e.getAttribute(attrib))

    # HTML defines the datetime value of <time> as the datetime attribute if
    # present, else the element's text. Without this fallback a
    # <time>2020-01-01</time> property came back as an empty string.
    if attrib == "datetime" and not e.hasAttribute(attrib):
        return _text(e)

    if attrib:
        return e.getAttribute(attrib)

    return e.getAttribute("content") or _text(e)
def _text(e):
chunks = []
if e.nodeType == e.TEXT_NODE:
chunks.append(e.data)
elif hasattr(e, 'tagName') and e.tagName == 'script':
return ''
for child in e.childNodes:
chunks.append(_text(child))
return ''.join(chunks)
def _make_item(e):
    """Build an Item from itemscope element ``e``.

    The Item is created from the element's ``itemtype`` and ``itemid``
    attributes (each None when absent). Raises Exception when ``e`` has no
    ``itemscope`` attribute.
    """
    if _is_itemscope(e):
        return Item(_attr(e, "itemtype"), _attr(e, "itemid"))
    raise Exception("element is not an Item")
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()