"""registry_data.py -- download events and their articles from the Event
Registry API, caching intermediate results as gzipped CSV files under csv/."""
import os.path as op
import eventregistry as er
import pandas as pd
def fetch_event_articles(api_key, min_articles=500, force=False, save_on_api_fail=True, csv_file=None):
    """Fetch English events and all of their articles from Event Registry.

    Events and per-event article sets are cached as gzipped CSVs under
    ``csv/``; the combined article table is written to ``csv_file``.

    Parameters
    ----------
    api_key : str
        Event Registry API key.
    min_articles : int
        Only fetch events with at least this many articles (and at most
        ``10 * min_articles``).
    force : bool
        If True, ignore all cached CSVs and re-download everything.
    save_on_api_fail : bool
        If True, an API failure stops further downloads but still saves
        whatever was collected; if False, the failure is re-raised.
    csv_file : str or None
        Path of the combined-articles CSV. Defaults to
        ``articles-min<min_articles>.csv``.

    Returns
    -------
    (df_events, df_articles) : tuple of pandas.DataFrame
    """
    event_registry = er.EventRegistry(apiKey=api_key, repeatFailedRequestCount=2)

    # Single query to collect event ids; cached as a gzipped CSV.
    all_events_gzip_file = op.join('csv', 'events_min%d.csv' % min_articles) + '.gz'
    if not force and op.exists(all_events_gzip_file):
        df_events = pd.read_csv(all_events_gzip_file, compression='gzip')
    else:
        event_data = []
        qei = er.QueryEventsIter(
            lang='eng', minArticlesInEvent=min_articles,
            maxArticlesInEvent=min_articles * 10)
        for event in qei.execQuery(event_registry, maxItems=1001):
            event_data.append(event)
        df_events = pd.DataFrame(event_data)
        df_events.to_csv(all_events_gzip_file, encoding='utf-8', compression='gzip')
        del event_data

    # Uncache csv file.
    # BUG FIX: csv_file defaults to None and was passed straight to
    # op.exists(), which raises TypeError; guard for None explicitly.
    if not force and csv_file is not None and op.exists(csv_file):
        print("Loading articles from disk...")
        df_articles = pd.read_csv(csv_file)
    else:
        event_uris = df_events.uri.tolist()
        # Only keep English events (uris are prefixed with the language code).
        event_uris = [ev for ev in event_uris if ev[:3] == 'eng']
        print("Downloading articles for %d events..." % len(event_uris))

        # Loop to retrieve all articles for an event.
        return_info = er.ReturnInfo(
            articleInfo=er.ArticleInfoFlags(
                bodyLen=-1,
                concepts=True,
                categories=True,
                originalArticle=True))

        all_articles = []
        api_failed = False
        for uri in event_uris:
            # BUG FIX: was a Python 2 `print` statement, a SyntaxError on
            # Python 3 and inconsistent with the rest of the file.
            print("current uri: ", uri)
            current_event_data = []
            event_gzip_file = op.join('csv', 'event-%s.csv.gz' % uri)
            if not force and op.exists(event_gzip_file):
                # Cached copy exists; no API call needed even after a failure.
                tmp_df = pd.read_csv(event_gzip_file, compression='gzip')
            elif api_failed:
                # BUG FIX: this branch previously fell through without a
                # `continue`, appending the previous iteration's stale tmp_df.
                print("\tSkipping; API failed.")
                continue
            else:
                try:
                    query_iter = er.QueryEventArticlesIter(uri)
                    for article in query_iter.execQuery(
                            event_registry, lang="eng", returnInfo=return_info):
                        current_event_data.append(article)
                except TypeError:
                    # This is how API errors come through.
                    if save_on_api_fail:
                        print("\tWARNING: API failed. Skipping.")
                        api_failed = True  # stop further API calls; cached events still load.
                        continue
                    else:
                        raise
                # Specify columns, so that we skip any empty events.
                tmp_df = pd.DataFrame(current_event_data, columns=[
                    'body', 'categories', 'concepts', 'date', 'dateTime', 'eventUri',
                    'id', 'isDuplicate', 'lang', 'originalArticle', 'sim', 'source',
                    'time', 'title', 'uri', 'url'])
                tmp_df.to_csv(event_gzip_file, encoding='utf-8', compression='gzip')

            if len(tmp_df) == 0:
                print("WARNING: event contains no articles.")
            all_articles.append(tmp_df)

        # Combine all news articles into a single dataframe.
        df_articles = pd.concat(all_articles)
        csv_file = csv_file or 'articles-min%d.csv' % min_articles
        df_articles.to_csv(csv_file, encoding='utf-8')

    return df_events, df_articles
if __name__ == '__main__':
    import os

    # SECURITY: an API key was hard-coded here. Prefer the environment
    # variable; the old literal remains only as a fallback for backward
    # compatibility. NOTE(review): this key is committed to source control
    # and should be rotated, then the fallback removed.
    fetch_event_articles(
        api_key=os.environ.get('EVENT_REGISTRY_API_KEY',
                               "8b86c30c-cb8f-4d3f-aa84-077f3090e5ba"))