Scraper.py
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import json
from util import Utils
from browser import choose_browser


class Scraper:
    """Scrapes a user's accepted LeetCode submissions into a JSON file."""

    def __init__(self, file_path):
        self.driver = None
        self.file_path = file_path
        self.helper = Utils(file_path)

    def write_code_solution(self):
        """Visit each submission URL and scrape its code from the page's Ace editor."""
        with open(self.file_path, 'r+') as fichier:
            data = json.load(fichier)
            problem_num = self.helper.get_number_of_problems()
            treated_num = 0
            for row in data:
                self.driver.get(row['url'])
                time.sleep(5)  # give the submission page time to render
                # The submitted code is rendered line by line inside an Ace editor.
                code_container = self.driver.find_element(By.CSS_SELECTOR, "div.ace_layer.ace_text-layer")
                code_lines = code_container.find_elements(By.CSS_SELECTOR, "div.ace_line_group")
                code = "\n".join(line.text for line in code_lines)
                row['code'] = code.strip()
                treated_num += 1
                self.helper.display_progress_bar(problem_number=problem_num, problem_treated_num=treated_num)
            # Rewrite the file in place with the scraped code attached to each entry.
            fichier.seek(0)
            json.dump(data, fichier, indent=4)
            fichier.truncate()

    def extract_problems(self, data_row):
        """Turn raw submission rows into problem entries (name, state, URL, language)."""
        problems = []
        for data in data_row:
            problem_name = self.helper.ExtractProblemName(data['line'])
            language = self.helper.extract_language_name(data['line'])
            if problem_name:
                problems.append({'problem': problem_name, 'state': 'Accepted',
                                 'url': data['url'], 'language': language})
        return problems

    def get_problems(self, table_rows):
        """Collect the text and detail URL of every 'Accepted' row on the current page."""
        data_row = []
        has_data = False
        for row in table_rows:
            cells = row.find_elements(By.TAG_NAME, "td")
            for cell in cells:
                if 'Accepted' in cell.text:
                    # find_element raises NoSuchElementException rather than
                    # returning None, so guard the lookup explicitly.
                    try:
                        link = cell.find_element(By.TAG_NAME, 'a')
                    except NoSuchElementException:
                        break
                    data_row.append({'line': row.text, 'url': link.get_attribute('href')})
                    has_data = True
                    break  # one 'Accepted' cell per row is enough
        return data_row, has_data

    def Authenticate(self):
        """Open the LeetCode login page and wait for the user to sign in manually."""
        self.driver = choose_browser()
        self.driver.get("https://leetcode.com/accounts/login/")
        input("Please log in manually, then press Enter to continue...")

    def extract_data(self):
        """Page through the submissions list, save accepted problems, then scrape their code."""
        if not self.driver:
            return None
        page_counter = 1
        problems = []
        with open(self.file_path, 'w') as fichier:
            while True:
                self.driver.get(f"https://leetcode.com/submissions/#/{page_counter}")
                time.sleep(5)  # give the submissions table time to render
                table_rows = self.driver.find_elements(By.CSS_SELECTOR, "table tbody tr")
                if not table_rows:
                    print("No more rows found at page:", page_counter)
                    break
                data_row, has_data = self.get_problems(table_rows)
                if has_data:
                    print(f"Data found on page {page_counter}")
                    problems.extend(self.extract_problems(data_row))
                else:
                    print(f"No data at page {page_counter}")
                    break
                page_counter += 1
            json.dump(problems, fichier, indent=4)
        # After writing the file, remove duplicate entries.
        self.helper.remove_duplicates_from_json()
        # Scrape the code for each saved submission.
        self.write_code_solution()
        # Then create a directory for each problem.
        self.helper.create_folders()
        if self.driver:
            self.driver.quit()


if __name__ == "__main__":
    scraper = Scraper('./problems.json')
    scraper.Authenticate()
    scraper.extract_data()
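
The `choose_browser` and `Utils` helpers come from `browser.py` and `util.py`, which are not shown on this page. Below is a minimal sketch of the interface Scraper.py assumes, reconstructed only from the call sites above; the function bodies, the browser prompt, and the module layout are assumptions, not the repository's actual code.

# browser.py (hypothetical sketch; assumes choose_browser() returns a Selenium WebDriver)
from selenium import webdriver

def choose_browser():
    choice = input("Browser (chrome/firefox): ").strip().lower()
    if choice == "firefox":
        return webdriver.Firefox()
    return webdriver.Chrome()

# util.py (hypothetical stub; names and signatures inferred from Scraper.py's call sites)
class Utils:
    def __init__(self, file_path):
        self.file_path = file_path

    def get_number_of_problems(self): ...  # total entries in the JSON file
    def display_progress_bar(self, problem_number, problem_treated_num): ...
    def ExtractProblemName(self, line): ...  # parse the problem title from a row's text
    def extract_language_name(self, line): ...  # parse the language from a row's text
    def remove_duplicates_from_json(self): ...
    def create_folders(self): ...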