-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_database.py
143 lines (125 loc) · 4.47 KB
/
create_database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import sqlite3
import pandas as pd
import dropbox
import numpy as np
import io
from configparser import ConfigParser
from datetime import datetime
from os.path import exists
# Module-level setup: read the Dropbox access token from the local
# 'dropbox_token.ini' file (section [token], key 'token') and build the
# Dropbox client that every helper below uses through the global `dbx`.
config = ConfigParser()
# ConfigParser.read() silently ignores a missing file; in that case the
# lookup on the next line raises KeyError for the absent 'token' section.
config.read('dropbox_token.ini')
DROPBOX_TOKEN = config['token']['token']
dbx = dropbox.Dropbox(DROPBOX_TOKEN)
def getShareLinkFromPath(path):
    """Return a shared-link URL for the Dropbox file at ``path``.

    A new shared link is requested first. If the Dropbox API rejects that
    request (for example because a link for this path already exists), the
    first link listed for the path is used instead.

    The last character of the URL is replaced with ``'1'``.
    NOTE(review): presumably this turns a trailing ``dl=0`` into ``dl=1``
    (direct download) -- confirm against the link format Dropbox returns.

    Args:
        path: Dropbox path of the file.

    Returns:
        The shared-link URL with its final character replaced by ``'1'``.

    Raises:
        IndexError: if link creation was rejected and no existing link is
            listed for ``path``.
    """
    try:
        shared_link_metadata = dbx.sharing_create_shared_link_with_settings(path)
    except dropbox.exceptions.ApiError:
        # Only API-level rejections trigger the fallback; authentication,
        # network and interpreter-level errors are allowed to propagate
        # instead of being hidden by a bare ``except``.
        shared_link_metadata = dbx.sharing_list_shared_links(path).links[0]
    shared_link = shared_link_metadata.url
    return shared_link[:-1] + '1'
def readDfFromPath(path, encoding='latin-1'):
    """Download the CSV at Dropbox ``path`` and return it as a DataFrame.

    The downloaded bytes are parsed with ``pd.read_csv`` using ``encoding``
    and the result is passed through ``adaptDf`` together with ``path``.

    Args:
        path: Dropbox path of the CSV file.
        encoding: Text encoding handed to ``pd.read_csv``.

    Returns:
        The DataFrame returned by ``adaptDf``.
    """
    _, response = dbx.files_download(path=path)
    with io.BytesIO(response.content) as buffer:
        frame = pd.read_csv(buffer, encoding=encoding)
    return adaptDf(path, frame)
def adaptDf(path, df):
    """Adapt ``df`` according to the source named in ``path``.

    The lower-cased path is checked, in this order, for "reddit", "cnn",
    "facebook", "new york times", "the guardian" and "twitter"; the first
    match decides how the frame is adapted. CNN, Facebook and The Guardian
    frames get constant ``platform`` and ``country`` columns (The Guardian
    is additionally down-sampled with ``randomKeep(df, 0.5)``); the other
    sources are delegated to their ``adapt*`` helper. A path matching none
    of the names returns ``df`` unchanged.

    Args:
        path: Path of the file ``df`` was read from.
        df: DataFrame to adapt; columns are added in place.

    Returns:
        The adapted DataFrame.
    """
    lowered = str(path).lower()

    if "reddit" in lowered:
        return adaptReddit(df)

    if "cnn" in lowered:
        df['platform'] = 'CNN'
        df['country'] = None
        return df

    if "facebook" in lowered:
        df['platform'] = "facebook"
        df['country'] = None
        return df

    if "new york times" in lowered:
        return adaptNYT(df)

    if "the guardian" in lowered:
        df['platform'] = "The Guardian"
        df['country'] = None
        return randomKeep(df, 0.5)

    if "twitter" in lowered:
        return adaptTwitter(df)

    return df
def randomKeep(df, prob):
    """Randomly down-sample ``df``, keeping each row with probability ``prob``.

    A ``keep`` column holding one ``np.random.random`` draw per row is
    added to ``df`` in place; the rows whose draw is below ``prob`` are
    returned (the ``keep`` column is part of the result).

    Args:
        df: DataFrame to sample from.
        prob: Threshold compared against each row's random draw.

    Returns:
        The rows of ``df`` whose ``keep`` value is less than ``prob``.
    """
    row_count = df.shape[0]
    df['keep'] = np.random.random((row_count,))
    selected = df['keep'] < prob
    return df[selected]
def adaptReddit(df):
    """Map a Reddit frame onto the common post columns and down-sample it.

    Sets ``platform`` to "reddit", copies ``title`` to ``bodyText``,
    ``title-compound`` to ``sentiment`` and ``created_utc`` to ``date``,
    sets ``country`` to None, then passes the frame through
    ``randomKeep`` with a threshold of 0.075.

    Args:
        df: Reddit DataFrame; columns are added in place.

    Returns:
        The down-sampled DataFrame returned by ``randomKeep``.
    """
    df['platform'] = "reddit"
    # (target column, source column) pairs, applied in the original order.
    for target, source in (
        ('bodyText', 'title'),
        ('sentiment', 'title-compound'),
        ('date', 'created_utc'),
    ):
        df[target] = df[source]
    df['country'] = None
    return randomKeep(df, 0.075)
def adaptNYT(df):
    """Map a New York Times frame onto the common post columns.

    Copies ``source`` to ``platform``, sets ``country`` to None, and
    copies ``lead_paragraph`` to ``bodyText``, ``pub_date`` to ``date``
    and ``sentiment_pos`` to ``sentiment``.

    Args:
        df: New York Times DataFrame; columns are added in place.

    Returns:
        The same DataFrame with the added columns.
    """
    df['platform'] = df['source']
    df["country"] = None
    # (target column, source column) pairs, applied in the original order.
    for target, source in (
        ('bodyText', 'lead_paragraph'),
        ('date', 'pub_date'),
        ('sentiment', 'sentiment_pos'),
    ):
        df[target] = df[source]
    return df
def adaptTwitter(df):
    """Map a Twitter frame onto the common post columns and down-sample it.

    Sets ``platform`` to "Twitter" and ``country`` to None, copies
    ``compound`` to ``sentiment`` and ``text`` to ``bodyText``, then
    passes the frame through ``randomKeep`` with a threshold of 0.5.

    Args:
        df: Twitter DataFrame; columns are added in place.

    Returns:
        The down-sampled DataFrame returned by ``randomKeep``.
    """
    df['platform'] = "Twitter"
    df['country'] = None
    # (target column, source column) pairs, applied in the original order.
    for target, source in (('sentiment', 'compound'), ('bodyText', 'text')):
        df[target] = df[source]
    return randomKeep(df, 0.5)
def getFileNames(path):
    """Return the names of the files directly inside the Dropbox folder ``path``.

    Only entries that are ``dropbox.files.FileMetadata`` instances are
    included, so sub-folders and other entry types are skipped.

    Args:
        path: Dropbox path of the folder to list.

    Returns:
        A list with the ``name`` of every file entry, in listing order.
    """
    result = dbx.files_list_folder(path)
    entries = list(result.entries)
    # The listing is paginated: keep fetching pages until Dropbox reports
    # there is nothing more, otherwise large folders are truncated.
    while result.has_more:
        result = dbx.files_list_folder_continue(result.cursor)
        entries.extend(result.entries)
    return [
        entry.name
        for entry in entries
        if isinstance(entry, dropbox.files.FileMetadata)
    ]
def insertSigEventsFiles():
    """Append the significant-events data to ``significant_events.csv``.

    For every path in the module-level ``sig_events_files`` list, the file
    is read with ``readDfFromPath`` (UTF-8), its ``description`` column is
    copied to ``event``, the first rows are printed, and the ``date`` and
    ``event`` columns are appended to ``significant_events.csv`` in the
    current working directory.
    """
    out_path = 'significant_events.csv'
    for file_path in sig_events_files:
        df = readDfFromPath(file_path, encoding='utf-8')
        df['event'] = df['description']
        print(df.head())
        df = df[['date', 'event']]
        # Append mode creates the file when it is missing; the header row
        # is written only in that case so later appends stay header-free.
        df.to_csv(out_path, mode='a', index=False, header=not exists(out_path))
def insertPosts(folder_path, csv_name):
    """Append the posts of every file in a Dropbox folder to a local CSV.

    The folder ``'/DVA_Datasets' + folder_path`` is listed with
    ``getFileNames``; each file is read with ``readDfFromPath``, reduced
    to the columns ``platform``, ``bodyText``, ``sentiment``, ``date`` and
    ``country``, and appended to ``<csv_name>_filtered.csv``. The header
    row is written only when that CSV does not exist yet.

    Args:
        folder_path: Folder path relative to ``/DVA_Datasets`` (with a
            leading slash).
        csv_name: Prefix of the output file name.
    """
    wanted_columns = ['platform', 'bodyText', 'sentiment', 'date', 'country']
    dropbox_folder = '/DVA_Datasets' + folder_path
    target_csv = csv_name + '_filtered.csv'

    for file_name in getFileNames(dropbox_folder):
        print(file_name)
        posts = readDfFromPath(dropbox_folder + '/' + file_name)
        posts = posts[wanted_columns]
        if not exists(target_csv):
            # First write creates the file and includes the header row.
            posts.to_csv(target_csv, index=False)
        else:
            posts.to_csv(target_csv, mode='a', index=False, header=False)
# Dropbox paths of the significant-events CSV files.
sig_events_files = ['/DVA_Datasets/sig_ev_cleaned.csv']

# (folder relative to /DVA_Datasets, output CSV prefix) for each post source.
posts_folder_paths = [
    ('/twitter/sentiments', 'twitter'),
    ('/CNN/sentiments', 'cnn'),
    ('/Facebook/facebook_posts/sentiments/sentiments/sentiments', 'facebook'),
    ('/New York Times', 'nyt'),
    ('/Reddit/tagged', 'reddit'),
    ('/The Guardian/sentiments', 'guardian'),
]

print('INSERTING SIGNIFICANT EVENTS')
insertSigEventsFiles()

print("INSERTING POSTS")
for path in posts_folder_paths:
    print(path[0])
    # Each entry unpacks into insertPosts(folder_path, csv_name).
    insertPosts(*path)