-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
executable file
·114 lines (100 loc) · 3.02 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python3
import csv
import sys
import glob
import gzip
import json
from twarc.json2csv import get_row, get_headings
# People to look for in the tweet archives. tweet_match() compares
# "user_id" against the ID fields of each tweet and searches the tweet text
# for "name"; "screen_name" is carried along but not used for matching in
# this file. Entries with None for screen_name/user_id presumably have no
# known Twitter account (TODO confirm).
queries = [
    dict(name="Darren Seals", screen_name="KingDSeals", user_id="2747681903"),
    dict(name="Edward Crawford", screen_name="eyeFLOODpanties", user_id="84946406"),
    dict(name="Bassem Masri", screen_name="bassem_masri", user_id="2734647354"),
    dict(name="Deandre Joshua", screen_name=None, user_id=None),
    dict(name="Danye Jones", screen_name=None, user_id=None),
]
# Tweet archives to scan: a human-readable dataset name (written to the
# "dataset" column of the results) paired with a glob pattern selecting the
# gzip-compressed files that belong to that dataset.
data_dirs = [
    {"name": dataset_name, "glob": pattern}
    for dataset_name, pattern in (
        (
            "Beyond the Hashtags",
            "data/AE0A86DE-E17D-438E-BCDF-AA1F04851CAF/data/tweets/*.txt.gz",
        ),
        (
            "BlackLivesMatter",
            "data/4D41FEA7-9E85-45B8-9499-362212278CAB/data/*.json.gz",
        ),
        (
            "Ferguson Scrape",
            "data/D651C3F6-5619-4A42-A8BC-7C22B7A9A44A/data/*.json.gz",
        ),
        (
            "Ferguson",
            "data/fe28a093-d3f4-42d7-83ba-f5ba1b1cc765/data/*.json.gz",
        ),
    )
]
def main():
    """Scan every configured dataset and write matching tweets to results.csv.

    The CSV starts with a header row made of twarc's standard tweet headings
    followed by four extra columns: 'dataset', 'file', 'user_match' and
    'match_type'. Each archive path is echoed to stdout before it is
    processed so progress is visible on long runs.
    """
    # newline="" is what the csv module requires of the file it writes to
    # (otherwise extra blank rows appear on some platforms); an explicit
    # UTF-8 encoding keeps non-ASCII tweet text from failing under a
    # restrictive locale. The with-block guarantees the file is flushed and
    # closed even if processing raises part-way through.
    with open("results.csv", "w", newline="", encoding="utf-8") as results:
        out = csv.writer(results)
        out.writerow(get_headings() + ['dataset', 'file', 'user_match', 'match_type'])
        for d in data_dirs:
            # glob returns paths in arbitrary order; sort them so repeated
            # runs produce rows in the same order.
            for f in sorted(glob.glob(d['glob'])):
                sys.stdout.write('\n{}:'.format(f))
                sys.stdout.flush()
                process_file(d['name'], f, out)
def process_file(source, json_path, out):
    """Write one CSV row for every matching tweet in a gzipped JSON-lines file.

    Args:
        source: Dataset name recorded in the row's 'dataset' column.
        json_path: Path to a gzip-compressed file holding one JSON document
            per line; also recorded in the row's 'file' column.
        out: csv.writer-like object whose writerow() receives each row.

    Lines that are not valid JSON are skipped. A '.' is written to stdout
    for every match as a progress indicator.
    """
    # The with-block closes the gzip handle deterministically instead of
    # leaving it to the garbage collector.
    with gzip.open(json_path) as lines:
        for line in lines:
            try:
                tweet = json.loads(line)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses, so this still skips blank, truncated
                # or mis-encoded lines without also swallowing
                # KeyboardInterrupt or genuine programming errors.
                continue
            match = tweet_match(tweet)
            if match:
                sys.stdout.write('.')
                sys.stdout.flush()
                out.writerow(get_row(tweet) + [source, json_path] + match)
def tweet_match(t, targets=None):
    """Return how a tweet relates to one of the people being searched for.

    Args:
        t: A decoded tweet (dict). Records that are not dicts, or that lack
            the usual tweet fields, simply do not match.
        targets: Optional list of query dicts, each with a 'name' and a
            'user_id' (which may be None). Defaults to the module-level
            ``queries`` list.

    Returns:
        ``[name, match_type]`` for the first target that matches, or None.
        Targets are tried in list order and, for each target, the checks run
        in this order:

        * 'replied' / 'retweeted' / 'posted' - the target wrote the tweet
        * 'replied to' - the tweet is a reply to the target
        * 'user retweeted' - the tweet retweets the target
        * 'user mentioned' - the target is @-mentioned
        * 'name mention' - the target's name occurs in the text
          (case-insensitive)
    """
    if targets is None:
        targets = queries
    if not isinstance(t, dict):
        # A JSON line may decode to a bare number/string/list; not a tweet.
        return None

    # Everything derived from the tweet is the same for every target, so
    # compute it once. .get() is used throughout because archives can hold
    # non-tweet records that lack these keys.
    author_id = (t.get('user') or {}).get('id_str')
    reply_to_id = t.get('in_reply_to_user_id_str')
    rt = t.get('retweeted_status')
    rt_author_id = (rt.get('user') or {}).get('id_str') if rt else None
    mentions = (t.get('entities') or {}).get('user_mentions') or []
    mentioned_ids = {u.get('id_str') for u in mentions}
    text = (t.get('text') or t.get('full_text') or '').lower()

    for q in targets:
        user_id = q.get('user_id')

        # ID-based checks only make sense when the target has a known ID;
        # otherwise a None user_id would "equal" a missing/None tweet field.
        if user_id:
            # tweet by the user?
            if user_id == author_id:
                if reply_to_id:
                    return [q['name'], 'replied']
                elif rt is not None:
                    return [q['name'], 'retweeted']
                else:
                    return [q['name'], 'posted']

            # someone replied to a tweet by the user?
            if user_id == reply_to_id:
                return [q['name'], 'replied to']

            # user retweeted by someone else?
            if user_id == rt_author_id:
                return [q['name'], 'user retweeted']

            # user mentioned by someone else?
            if user_id in mentioned_ids:
                return [q['name'], 'user mentioned']

        # someone mentioned them by name?
        if q['name'].lower() in text:
            return [q['name'], 'name mention']

    return None
if __name__ == "__main__":
main()