-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathxml2json.py
283 lines (269 loc) · 11.7 KB
/
xml2json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import re, textwrap, json, sys
from collections import OrderedDict
from Queue import *
from pprint import pprint
from lxml import etree
from nltk.corpus import stopwords
width = 80
content_url = 'http://www.projectaon.org/en/xhtml/lw/02fotw/'
def processPara(para):
para = re.sub('</?p/?>', '', para)
para = re.sub('</?choice( idref=".*?")?/?>', '', para)
para = re.sub('</?link-text/?>', '', para)
para = re.sub('</?cite/?>', '', para)
para = para.replace('<ch.apos/>', "'")
para = para.replace('<ch.endash/>', "-")
para = para.replace('<ch.emdash/>', "-")
para = para.replace('<ch.ellips/>', "...")
para = para.replace('<ch.thinspace/>', "")
para = para.replace('<ch.frac12/>', "1/2")
para = re.sub('</?quote/?>', "\"", para)
para = re.sub('</?onomatopoeia>', '', para)
para = re.sub('<footref.*?/>', '', para)
para = re.sub('</?signpost>', '', para)
para = re.sub('</?description>', '', para)
para = re.sub('</?blockquote>', '', para)
para = re.sub('</?bookref.*?>', '', para)
return textwrap.wrap(para, width)
sections = OrderedDict()
custom = json.load(open('fotw_custom.json'))
parser = etree.XMLParser(resolve_entities=False)
tree = etree.parse('fotw.xml', parser=parser)
root = tree.getroot()
stopwords = set(stopwords.words('english'))
for w in ['turn', 'wish', 'want', 'turning', 'rather', 'would', 'along', 'upon', 'another']:
stopwords.add(w)
for sect_elem in root.findall('.//section[@class="numbered"]')[1:]:
#for sect_elem in root.findall('.//section[@id="sect%s"]' % 181):
sect_id = sect_elem.find('.//title').text
sect_paras = []
choices = []
combat = {}
enemies = []
rnt_found = False
ac_found = False
stats_found = False
undead_found = False
sommerswerd_found = False
immune_to_mindblast_found = False
illustration_found = False
must_eat = False
list_found = False
footref_found = False
mindforce_found = False
for item in sect_elem.find('data'):
s = etree.tostring(item).strip()
for subitem in item:
if subitem.tag == 'footref':
footref_found = True
if '<a idref="random">Random Number Table</a>' in s:
rnt_found = True
s = s.replace('<a idref="random">Random Number Table</a>', 'Random Number Table')
for a in ['COMBAT SKILL', 'ENDURANCE']:
if '<typ class="attribute">%s</typ>' % a in s:
s = s.replace('<typ class="attribute">%s</typ>' % a, a)
stats_found = True
if '<a idref=\"action\">Action Chart</a>' in s:
ac_found = True
s = s.replace('<a idref=\"action\">Action Chart</a>', 'Action Chart')
if 'undead' in s.lower():
undead_found = True
if 'sommerswerd' in s.lower():
sommerswerd_found = True
if 'immune' in s.lower() and 'mindblast' in s.lower():
immune_to_mindblast_found = True
if 'Meal' in s and 'must' in s:
must_eat = True
if 'Mindforce' in s:
mindforce_found = True
if item.tag == 'p':
sect_paras.append(processPara(s))
if item.tag == 'signpost':
s = processPara(s)[0]
spacer = ' ' * ((width - len(s)) / 2)
sect_paras.append([spacer + s])
if item.tag == 'ul':
for li in item:
t = etree.tostring(li).strip()
for a in ['COMBAT SKILL', 'ENDURANCE']:
if '<typ class="attribute">%s</typ>' % a in t:
t = t.replace('<typ class="attribute">%s</typ>' % a, a)
stats_found = True
lis = processPara(t)[0]
lis = re.sub('</?li>', '', lis)
sect_paras.append(['* %s' % lis])
list_found = True
elif item.tag == 'combat':
e = {'name': item.find('.//enemy').text,
'combat_skill': int(item.find('..//enemy-attribute[@class="combatskill"]').text),
'endurance': int(item.find('..//enemy-attribute[@class="endurance"]').text)}
enemies.append(e)
sect_paras.append(['%s: COMBAT SKILL %d, ENDURANCE %d' % (e['name'], e['combat_skill'], e['endurance'])])
elif item.tag == 'choice':
choice_sect_id = re.search('idref="sect(\d+)"', s).group(1)
choices.append({'section': choice_sect_id, 'text': '\n'.join(processPara(s))})
elif item.tag == 'illustration':
illustration_found = True
desc = ''
src = ''
for sub in item:
if sub.tag == 'meta' and len(sub) > 1 and sub[1].tag == 'description':
desc = ' ' + etree.tostring(sub[1]).rstrip()
if sub.get('class') == 'html':
src = sub.get('src')
if sub.get('class') == 'text':
for subsub in sub:
desc += ' '.join([line.strip() for line in etree.tostring(subsub).split()])
desc = '\n'.join(processPara(desc)).strip()
desc = ': %s' % desc if desc else ''
sect_paras.append(['%s/%s[Illustration]%s' % (content_url, src, desc)])
sect_text = '\n\n'.join(['\n'.join(p) for p in sect_paras])
is_random_pick = False
if rnt_found and re.search('\d-\d', choices[0]['text']):
is_random_pick = True
for choice in choices:
if re.search('(\d+)-(\d+)', choice['text']):
choice['range'] = [int(s) for s in re.search('(\d+)-(\d+)', choice['text']).groups()]
else:
n = int(re.search('(\d)', choice['text']).group(1))
choice['range'] = [n, n]
else: #if len(choices) > 1:
for choice in choices:
words = []
for w in re.split("[^A-Za-z0-9'-]+", choice['text']): # every nonalpha except "'" and "-"
w = w.lower()
if (len(w) < 3 and w != 'go') or w in stopwords or re.match('\d+', w): continue
words.append(w)
if words:
choice['words'] = words
if enemies:
combat['enemies'] = enemies
for i, choice in enumerate(choices):
if 'win' in choice['text'] or 'if you kill' in choice['text'].lower():
combat['win'] = {'choice': i}
elif 'evade' in choice['text']:
combat['evasion'] = {'choice': i}
if re.search('at any time|stage', choice['text']):
combat['evasion']['n_rounds'] = 0
elif 'after two rounds' in choice['text']:
combat['evasion']['n_rounds'] = 2
section = {'text': sect_text, 'choices': choices}
if combat:
if undead_found or sommerswerd_found:
assert len(combat['enemies']) == 1
combat['enemies'][0]['double_damage'] = True
if immune_to_mindblast_found:
assert len(combat['enemies']) == 1
combat['enemies'][0]['immune'] = "Mindblast"
if mindforce_found:
assert len(combat['enemies']) == 1
combat['enemies'][0]['has_mindforce'] = True
if 'win' not in combat and len(choices) == 1:
combat['win'] = {'choice': 0}
section['combat'] = combat
if is_random_pick: section['is_random_pick'] = True
if must_eat: section['must_eat'] = True
# merge custom content
if sect_id in custom['sections']:
cust_sect = custom['sections'][sect_id]
if 'alternate_choices' in cust_sect:
section['alternate_choices'] = True
# if 'chain_choices' in cust_sect:
# section['chain_choices'] = True
if 'no_ambiguity' in cust_sect:
section['no_ambiguity'] = True
if 'is_special' in cust_sect:
section['is_special'] = True
if 'items' in cust_sect:
section['items'] = cust_sect['items']
if 'endurance' in cust_sect:
section['endurance'] = cust_sect['endurance']
if 'combat' in cust_sect:
section['combat'].update(cust_sect['combat'])
if 'options' in cust_sect:
section['options'] = cust_sect['options']
if 'must_eat' in cust_sect:
if cust_sect['must_eat']:
section['must_eat'] = cust_sect['must_eat'] # possibly a int
else:
del section['must_eat']
if 'text' in cust_sect:
section['text'] = cust_sect['text']
if 'n_items_to_pick' in cust_sect:
section['n_items_to_pick'] = cust_sect['n_items_to_pick']
if 'trim_choices' in cust_sect:
section['trim_choices'] = True
if 'reduce_choices' in cust_sect:
section['reduce_choices'] = True
cc_sections = []
for custom_choice in custom['sections'][sect_id].get('choices', []):
if custom_choice['section'] in cc_sections:
exit('Error in custom section %s: duplicate choice sections' % sect_id)
cc_sections.append(custom_choice['section'])
for custom_choice in custom['sections'][sect_id].get('choices', []):
# no key to match here, so we got to match using choice.section (thus the need to search)
#print custom_choice['section']
found = False
for choice in section['choices']:
if choice.get('section') == custom_choice['section'] and 'is_artificial' not in custom_choice:
choice.update(custom_choice)
if '+words' in custom_choice:
choice['words'].extend(custom_choice['+words'])
del choice['+words']
if '-words' in custom_choice:
choice['words'] = [w for w in choice['words'] if w not in custom_choice['-words']]
del choice['-words']
found = True
break
if 'is_artificial' in custom_choice:
choices.append(custom_choice)
elif not found:
exit('Error in custom section %s: choice for section %s cannot be found' % (sect_id, custom_choice['section']))
sections[sect_id] = section
# special case reporting
report = []
if ac_found:
report.append('ac')
if rnt_found and not is_random_pick:
report.append('rnt')
if stats_found:
report.append('stats')
if footref_found:
report.append('footref')
# if illustration_found:
# report.append('illustration')
# if must_eat:
# report.append('must eat')
if list_found:
report.append('list')
if report and sect_id not in custom['sections']: #True:
print '%s: %s' % (sect_id, ', '.join(report))
section_od = OrderedDict()
for sect_id in range(1, 351):
sect_id = str(sect_id)
section_od[sect_id] = sections[sect_id]
# q = Queue()
# visited = set()
# q.put('1')
# section_od = OrderedDict()
# to_set = []
# while not q.empty():
# sect_id = q.get()
# to_set.append(sect_id)
# section_od[sect_id] = sections[sect_id]
# # if sect_id == '197': continue
# for choice in sections[sect_id]['choices']:
# if choice['section'] not in visited:
# q.put(choice['section'])
# visited.add(choice['section'])
result_od = OrderedDict()
setup_od = OrderedDict()
for f in ['sequence', 'disciplines', 'weapons', 'equipment']:
setup_od[f] = custom['setup'][f]
custom['setup'] = setup_od
for f in ['prompt', 'intro_sequence', 'setup', 'synonyms']:
result_od[f] = custom[f]
result_od['sections'] = section_od
#json.dump(result_od, open('fotw_generated.json', 'w'), indent=4)
open('fotw.json', 'w').write('\n'.join([line.rstrip() for line in json.dumps(result_od, indent=4).split('\n')]))
print 'produced %d sections' % len(result_od['sections'])