-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathextract.py
48 lines (45 loc) · 1.24 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import re
import json
# f = open('abstracts_en.ttl','r', encoding='utf-8')
# l = f.readlines()
# lines = []
# buffer = []
# for i in l:
# if i == '\n':
# lines.append(buffer)
# buffer = list()
# else:
# buffer.append(i[:-1])
# abstracts = {}
# mode = 0
# count = 0
# for line in lines:
# context = None
# for sentence in line:
# if mode == 1:
# subjectsearch = re.search(r'<http://dbpedia.org/resource/(.*)/abstract', line[0])
# subject = subjectsearch.group(1)
# abstracts[subject] = context
# mode = 0
# matches = re.search(r'nif:isString \"\"\"(.*)\"\"\"\^\^xsd:string;', sentence)
# if matches:
# context = matches.group(1)
# mode = 1
# if count == 10:
# break
# with open('abstracts_data.json', 'w') as outfile:
# json.dump(abstracts, outfile)
from pprint import pprint
def sample_contexts(topics, max_topics=10, paragraphs_per_topic=3):
    """Collect the leading paragraph contexts of the first few topics.

    Args:
        topics: List of dicts, each holding a ``'paragraphs'`` list whose
            items are dicts with a ``'context'`` entry.
        max_topics: Maximum number of topics to sample from the front of
            ``topics``.
        paragraphs_per_topic: Maximum number of leading paragraphs taken
            from each topic. Topics with fewer paragraphs contribute what
            they have instead of raising ``IndexError``.

    Returns:
        A list with one inner list per sampled topic, containing that
        topic's selected ``'context'`` values in order.

    Raises:
        KeyError: If a topic lacks ``'paragraphs'`` or a paragraph lacks
            ``'context'``.
    """
    return [
        [
            paragraph['context']
            for paragraph in topic['paragraphs'][:paragraphs_per_topic]
        ]
        for topic in topics[:max_topics]
    ]


def main(path='wikiData.json'):
    """Load the JSON file at *path* and print a sample of its contexts.

    For every sampled topic the selected contexts are printed one per line,
    followed by the 1-based topic number and a blank separator.
    """
    # Decode explicitly as UTF-8 rather than relying on the locale default,
    # which differs between platforms.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    # For debugging, pprint(data['data']) dumps the whole structure.
    for count, contexts in enumerate(sample_contexts(data['data']), start=1):
        for context in contexts:
            print(context)
        print(count)
        print('\n')


if __name__ == '__main__':
    main()