-
Notifications
You must be signed in to change notification settings - Fork 0
/
stackoverflow_code_retrieval.py
138 lines (113 loc) · 4.99 KB
/
stackoverflow_code_retrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from requests import get
import time
from requests.exceptions import ProxyError
from Code_retrieval.GetTags import GetTags
from General.DigitalOceanAPI import DigitalOceanAPI
from bs4 import BeautifulSoup
from Code_retrieval.GetCode import GetCode
from Code_retrieval.GetAuthor import GetAuthor
from Code_retrieval.GetAccepted import GetAccepted
from General.GetPost import GetPost
from UserSelection import UserSelection
from General.DatabaseConnection import DatabaseConnection
from General.RequestTimeout import RequestTimeout
# Root URL for Russian StackOverflow questions; a numeric post id is appended
# per post (see GetPost.create_url) to form each page URL that gets scraped.
base_url = 'https://ru.stackoverflow.com/questions/'
def create_droplet(name):
    """Spin up a DigitalOcean droplet and block until provisioning completes.

    Polls the droplet status every 5 seconds for as long as it reports
    "in-progress", then refreshes the droplet data to obtain its public IP.

    :param name: name to assign to the new droplet
    :return: tuple of (ip_address, droplet handle)
    """
    droplet = DigitalOceanAPI.create_droplet(name)
    current_status = DigitalOceanAPI.check_status(droplet)
    while current_status == "in-progress":
        current_status = DigitalOceanAPI.check_status(droplet)
        time.sleep(5)
    # Re-load the droplet so the freshly assigned network info is populated.
    details = droplet.load()
    final_status = DigitalOceanAPI.check_status(droplet)
    print(final_status)
    return details.ip_address, droplet
def get_author(soup):
    """Extract author name, reputation and creation date from the question.

    :param soup: BeautifulSoup document of a StackOverflow question page
    :return: (author, reputation, creation_date) as produced by GetAuthor
    """
    question = GetCode.get_question(soup)
    return GetAuthor.get_author(question)
def get_code(conn, soup, i, accepted, author, reputation, creation_date, tags, post_id):
    """Extract code snippets from question and answers, then persist them.

    Pulls the <pre> blocks out of both the question and the answer
    containers, reduces them to code text, and writes one record to the
    database via GetCode.save_database.

    :param conn: open database connection
    :param soup: BeautifulSoup document of the post page
    :param i: running index of the post within the current batch
    :param accepted: accepted-answer author info (see get_accepted)
    :param post_id: StackOverflow id of this post
    """
    question = GetCode.get_question(soup)
    answers = GetCode.get_answers(soup)
    question_code = GetCode.get_code(GetCode.get_pre(question))
    answer_code = GetCode.get_code(GetCode.get_pre(answers))
    GetCode.save_database(i, conn, post_id, author, reputation, creation_date,
                          accepted, question_code, answer_code, tags)
def get_tags(soup):
    """Return the question's tags as one comma-separated string.

    :param soup: BeautifulSoup document of a StackOverflow question page
    :return: tags joined with ", ", e.g. "python, pandas"
    """
    question = GetCode.get_question(soup)
    # Join directly instead of going through a temporary separator variable
    # and rebinding the list — same result, idiomatic form.
    return ', '.join(GetTags.get_tags(question))
def get_accepted(soup):
    """Return author info for the accepted answer of the post.

    Locates the accepted answer inside the answers container and resolves
    its author through GetAuthor.

    :param soup: BeautifulSoup document of the post page
    :return: whatever GetAuthor.get_author yields for the accepted answer
    """
    answers = GetCode.get_answers(soup)
    accepted = GetAccepted.accepted_id(answers)
    return GetAuthor.get_author(accepted)
def request_loop(i, loop_length, post_url, proxies, db_connection, droplet, post_id, file_name, enable_proxies):
    """Download posts i..loop_length-1, extract their data, and store them.

    For every post URL: fetch the page (optionally through the proxy
    droplet), detect rate-limiting via RequestTimeout, scrape author,
    tags, accepted answer and code, save to the database, and checkpoint
    progress to `<file_name>.txt`.

    Fix vs. original: the original retried failures by calling itself
    recursively.  When the recursive call finished, control fell back into
    the caller's loop with a stale index and stale proxies, re-processing
    posts that were already done (and, on `ProxyError`, immediately
    re-requesting with the same broken proxies).  Retries are now a plain
    `continue` in a single loop, so state (`i`, `proxies`, `droplet`) is
    always current and recursion depth is no longer a concern.

    :param i: index of the first post to process
    :param loop_length: exclusive upper bound of the index range
    :param proxies: requests-style proxy dict, or None
    :param droplet: current proxy droplet handle, or None
    :param enable_proxies: route requests through the droplet when True
    :return: the (possibly replaced) droplet handle, for later teardown
    """
    while i < loop_length:
        time.sleep(1)  # be polite to the server between requests
        try:
            if enable_proxies:
                response = get(post_url[i], proxies=proxies)
            else:
                response = get(post_url[i])
        except ProxyError:
            print("Proxy Error")
            continue  # retry the same post with the current proxy state
        if RequestTimeout.check_availability(response):
            # We got rate-limited / blocked: rotate the proxy droplet, or
            # back off for 5 minutes when running without proxies.
            if enable_proxies:
                DigitalOceanAPI.delete_droplet(droplet)
                ip_address, droplet = create_droplet("proxy-droplet")
                proxies = {
                    "http": "https://" + ip_address + ":3128",
                    "https": "https://" + ip_address + ":3128",
                }
            else:
                time.sleep(300)
            continue  # retry the same post after recovery
        html_soup = BeautifulSoup(response.content, 'lxml')
        accepted_post = get_accepted(html_soup)
        author, reputation, creation_date = get_author(html_soup)
        tags = get_tags(html_soup)
        get_code(db_connection, html_soup, i, accepted_post, author,
                 reputation, creation_date, tags, post_id[i])
        i += 1
        # Checkpoint the next index so an interrupted run can resume.
        with open(file_name + '.txt', 'w') as file:
            file.write(str(i))
    return droplet
def main():
    """Entry point: scrape the selected StackOverflow posts into SQLite.

    Connects to the local database, lets the user pick a CSV of post ids,
    optionally provisions a DigitalOcean proxy droplet, runs the request
    loop over the chosen index range, and tears the droplet down at the end.
    """
    db_connection = DatabaseConnection.create_connection(
        r"C:\Users\ba051652\OneDrive - Otto-Friedrich-Universität Bamberg\SS 20\Bachelorarbeit\Materialien\StackOverflow data dump\ru_database\stackoverflow_ru.db")
    chosen_csv = GetPost.select_csv()
    post_id = GetPost.get_id(chosen_csv)
    post_url = GetPost.create_url(base_url, post_id)
    file_name = UserSelection.save_name()
    enable_proxies = DigitalOceanAPI.proxy_function()
    droplet = None
    proxies = None
    if enable_proxies:
        ip_address, droplet = create_droplet("proxy-droplet")
        proxy_endpoint = "https://" + ip_address + ":3128"
        proxies = {"http": proxy_endpoint, "https": proxy_endpoint}
    start, end = GetPost.select_parameter(post_url)
    # 1014 post without an author/ deleted account
    droplet = request_loop(start, end, post_url, proxies, db_connection,
                           droplet, post_id, file_name, enable_proxies)
    if droplet is not None:
        DigitalOceanAPI.delete_droplet(droplet)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()