forked from edwardhuahan/twitter-covid-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
110 lines (83 loc) · 3.82 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""
DEC 09 2021
CSC Final Project 2021
This script is used to scrape tweets from twitter for use in my csc110 final project
The scraper accepts a range of dates and outputs the tweets into a csv file.
Each line of data in the csv file will be of the following:
<status id>, <status date>, <content>
The following is the csv header:
id,date,contents
EVERY TIME YOU START A NEW SCRAPE REQUEST, THERE IS A CHANCE THE TWITTER API
DECLINES TO GIVE SNSCRAPE A GUEST TOKEN,
WHICH MEANS THAT EVERY TIME IT SEARCHES A NEW DAY THERE IS A CHANCE IT BREAKS.
THERE IS NO WAY AROUND THIS EXCEPT TO REQUEST MORE TWEETS EACH DAY TO MINIMIZE THE RISK.
This file is Copyright (c) 2021 Edward Han, Zekun Liu (ノ◕ヮ◕)ノ*:・゚✧, Arvin Gingoyon
"""
import csv
from datetime import datetime, timedelta
import snscrape.modules.twitter as snt
def scrape(start_date_raw: str, max_tweets: int) -> None:
    """ Scrape COVID-related tweets from start_date_raw through today and
    append them to data.csv as rows of <status id>, <status date>, <content>.

    The requested tweet total is split evenly across the days in the
    (inclusive) date range, at most max_tweets // (number of days) per day.

    Preconditions:
        - datetime.strptime(start_date_raw, "%Y/%m/%d") <= datetime.now()
        - max_tweets >= 0

    :param start_date_raw: Str, A date in the format YYYY/MM/DD
    :param max_tweets: Int, The number of total tweets you want
    :return: None, appends the scraped tweets to data.csv
    """
    current_date = datetime.now().date()

    # Convert start_date into a date object
    start_date = datetime.strptime(start_date_raw, "%Y/%m/%d").date()

    # Spread the requested total evenly over the inclusive day range.
    day_delta = (current_date - start_date).days
    tweets_per_day = max_tweets // (day_delta + 1)

    # Open csv file in append mode, creating it if it does not exist;
    # the `with` block closes it, so no explicit close() is needed.
    with open('data.csv', mode='a', newline='', encoding='utf8') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Only write the header when the file is empty (freshly created);
        # previously the header was re-written on every call, interleaving
        # duplicate header rows with the appended data.
        if csv_file.tell() == 0:
            csv_writer.writerow(['id', 'date', 'contents'])
        # scrape data, one day at a time
        while start_date <= current_date:
            dt = start_date.strftime('%Y-%m-%d')
            dtn = next_day(start_date).strftime('%Y-%m-%d')
            # since:/until: restrict the search to the single day [dt, dtn).
            search_term = 'covid OR coronavirus OR covid19' \
                          f' OR corona lang:en since:{dt} until:{dtn}'
            for i, tweet in enumerate(snt.TwitterSearchScraper(search_term).get_items()):
                # `>=` (not `>`): the old comparison collected one extra
                # tweet per day, overshooting max_tweets in total.
                if i >= tweets_per_day:
                    break
                print(f'Found tweet number {i} for date {dt}')
                csv_writer.writerow([tweet.id, tweet.date, tweet.content])
            start_date += timedelta(days=1)
def next_day(original_date: datetime.date) -> datetime.date:
    """ Return the date one day after the one given.

    This helper exists so that date advancement is testable in isolation,
    instead of adding datetime.timedelta(days=1) directly onto
    datetime.datetime.now().

    >>> example = datetime(2021, 12, 1)
    >>> next_day(example)
    datetime.datetime(2021, 12, 2, 0, 0)
    """
    one_day = timedelta(days=1)
    return original_date + one_day
if __name__ == "__main__":
    import python_ta
    import python_ta.contracts

    # Runtime contract checking for the development workflow.
    python_ta.contracts.DEBUG_CONTRACTS = False
    python_ta.contracts.check_all_contracts()

    # Static analysis via python_ta; keep this inside the __main__ guard.
    pyta_config = {
        'extra-imports': ['snscrape.modules.twitter', 'csv', 'datetime'],
        'allowed-io': ['scrape'],
        'max-line-length': 100,
        'disable': ['R1705', 'C0200'],
    }
    python_ta.check_all(config=pyta_config)

    scrape('2021/12/09', 1000)  # Default values