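"""refresher.py

Scrapes the Travelmate Work & Travel employers page, extracts one row per
employer link (state code, job-title slug, timestamp, URL, status), and
reports any links that are not yet in the previously saved CSV.
"""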
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
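
# Configuration: DATA_PATH = "" keeps the cached HTML and the CSV output in
# the current working directory; WAIT_SECONDS is how long fetch_page() waits
# for the page to finish rendering before capturing the HTML.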
URL = "https://www.travelmate.com.br/empregadores-worktravel/"
DATA_PATH = ""
WAIT_SECONDS = 15


def write_on_disk(file_to_save, data_path=DATA_PATH):
    # Cache the raw HTML so later runs can re-parse without hitting the site.
    try:
        with open(data_path + 'site-data.html', 'w', encoding='utf-8-sig') as file:
            file.write(file_to_save)
        print("Data saved!")
    except OSError as e:
        print("Couldn't save data!")
        print(e)


def fetch_page(site_url=URL, sleep_time=WAIT_SECONDS):
    print("Will fetch data from: " + site_url)
    # The geckodriver path is machine-specific; adjust it for your setup.
    service = Service(executable_path=r'F:\Portable Programs\geckodriver\geckodriver.exe')
    driver = webdriver.Firefox(service=service)
    try:
        driver.get(site_url)
        print("Waiting until page loading is finished")
        time.sleep(sleep_time)  # crude fixed wait for client-side rendering
        print("Fetching data...")
        return driver.page_source
    finally:
        driver.quit()
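

# A minimal sketch of a more robust alternative to the fixed sleep above,
# using Selenium's explicit waits. It assumes the employer sections render
# client-side as div.sessao-estado (the same selector main() parses); the
# function name and 30-second timeout are illustrative, not part of the
# original script.
def fetch_page_with_wait(site_url=URL, timeout=30):
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    service = Service(executable_path=r'F:\Portable Programs\geckodriver\geckodriver.exe')
    driver = webdriver.Firefox(service=service)
    try:
        driver.get(site_url)
        # Block until at least one state section is present, or raise
        # TimeoutException after `timeout` seconds.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "sessao-estado"))
        )
        return driver.page_source
    finally:
        driver.quit()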


def load_file(data_path=DATA_PATH):
    # Prefer the cached HTML on disk; fall back to a live fetch (and cache
    # the result) when no cache exists yet. The original guard skipped the
    # whole lookup when data_path was empty, so nothing ever loaded.
    print("Loading data from: " + (data_path or "current directory"))
    try:
        with open(data_path + 'site-data.html', 'r', encoding='utf-8-sig') as file:
            loaded_file = file.read()
        print("Loaded!")
    except FileNotFoundError:
        loaded_file = fetch_page()
        write_on_disk(loaded_file)
    return loaded_file


def load_csv(file_name="out"):
    file_name += '.csv'
    try:
        return pd.read_csv(DATA_PATH + file_name)
    except FileNotFoundError:
        print("File [" + file_name + "] not found!")
        print("A new file will be created!")
        # Return an empty frame with the expected schema so main() can still
        # compare links on the first run instead of crashing on None.
        return pd.DataFrame(columns=['estado', 'titulo', 'data_add', 'link', 'status'])


def write_csv(dataframe, name='out'):
    # Write the CSV under the given name, or out.csv by default. index=False
    # keeps the row index out of the file so load_csv() round-trips cleanly.
    filename = name + '.csv'
    dataframe.to_csv(DATA_PATH + filename, index=False)
    print("File saved as " + filename)


def main():
    output_data = load_csv()
    html = load_file()
    if html:
        soup = BeautifulSoup(html, features="html.parser")
        results = soup.find_all("div", {"class": "sessao-estado"})
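        # Assumed markup per state section (reconstructed from the selectors
        # used below; the surrounding HTML details are an assumption):
        #   <div class="sessao-estado" id="sessao-SC">
        #     ...
        #     <a class="link-verde" href=".../nome-da-vaga.html">...</a>
        #   </div>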
        estado = []  # one row per employer link: (state, title, timestamp, link, status)
        for result in results:
            state_code = result.get('id')[7:]  # strip the "sessao-" prefix
            for i in result.find_all('a', class_="link-verde"):
                link = i.get('href')
                # The title is the last path segment of the link, minus its extension.
                title_start = link.rfind("/") + 1
                title_end = link.rfind(".")
                title = link[title_start:title_end]
                now = datetime.today().strftime('%Y-%m-%d %H:%M')
                estado.append((state_code, title, now, link, 0))
        known_links = output_data.link.to_list()
        new_data = pd.DataFrame(estado, columns=['estado', 'titulo', 'data_add', 'link', 'status'])
        scraped_links = new_data.link.to_list()
        # Report any scraped link that is not in the saved CSV yet, and
        # overwrite the CSV only when something actually changed.
        diff = False
        for link in scraped_links:
            if link not in known_links:
                if not diff:
                    print("Updated employers:")
                diff = True
                print(link)
        if diff:
            write_csv(new_data)
        else:
            print("No employer updates!")


if __name__ == "__main__":
    main()
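
# Note: once site-data.html exists, load_file() reuses the cached copy, so
# delete it (or point DATA_PATH elsewhere) to force a fresh fetch when you
# want to check the site for new employer links.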