-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #210 from ResidenciaTICBrisa/186-script-dados-iea
186 script dados iea
- Loading branch information
Showing
7 changed files
with
687 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ jobs: | |
pip install bs4==0.0.1 | ||
pip install openpyxl==3.1.2 | ||
- name: execute py script # run main.py | ||
run: python 04_PipelineTCU/main.py | ||
run: python 04_PipelineTCU/scripts/main.py | ||
- name: commit files | ||
run: | | ||
git config --local user.email "[email protected]" | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import requests | ||
import re | ||
from bs4 import BeautifulSoup | ||
from pathlib import Path | ||
|
||
|
||
class ExcelScrapperIEA:
    """Download the file linked from a web page's download anchor.

    The page at ``url`` is fetched and parsed; the first ``<a>`` element that
    carries both a ``download`` and an ``href`` attribute is taken as the
    download link, and the linked file is written under
    ``path_raiz + diretorio``.

    Attributes:
        url: Address of the page that contains the download anchor.
        nome_arquivo: Last path segment of the download link; ``None`` until
            ``baixa_arquivo`` has located a link.
        path_raiz: Root directory that ``diretorio`` is appended to.
        download_link: Absolute URL of the file; ``None`` until located.
        diretorio: Sub-path (relative to ``path_raiz``) where the file is
            stored.
        path_destino: Full destination ``Path`` of the saved file; ``None``
            until the download succeeds.
    """

    # Without a timeout ``requests`` waits indefinitely on a stalled server,
    # which would hang the scheduled pipeline run.
    TIMEOUT_SEGUNDOS = 60

    def __init__(self, url, path_raiz):
        """Store the page URL and the root directory for the download.

        :param url: page that contains the download anchor
        :param path_raiz: root directory that ``diretorio`` is appended to
        """
        self.url = url
        self.nome_arquivo = None
        self.path_raiz = path_raiz

        self.download_link = None
        self.diretorio = '/scripts/constants/IEA/'
        self.path_destino = None

    @staticmethod
    def _extrai_nome_arquivo(link: str) -> str:
        """Return the text after the last ``/`` of ``link``.

        :raises ValueError: if ``link`` is empty or ends with ``/`` and so has
            no trailing file name
        """
        match = re.search(r'[^/]+$', link)
        if match is None:
            raise ValueError(
                f"Não foi possivel extrair o nome do arquivo de {link} \n"
            )
        return match.group(0)

    def baixa_arquivo(self) -> str:
        """Locate the download link on the page and save the linked file.

        :return: name of the downloaded file
        :raises ValueError: if the page cannot be fetched, no download anchor
            is present, the link has no file name, or the file download does
            not answer with status 200
        """
        # Local import keeps this block self-contained; urljoin is stdlib.
        from urllib.parse import urljoin

        response = requests.get(self.url, timeout=self.TIMEOUT_SEGUNDOS)
        # Only a 200 answer is treated as a successful page fetch.
        if response.status_code != 200:
            raise ValueError(f"Não foi possivel acessar a url {self.url} \n")

        soup = BeautifulSoup(response.text, 'html.parser')
        download_button = soup.find('a', attrs={'download': True, 'href': True})
        # ``find`` returns None when nothing matches; report it as a missing
        # link instead of failing with a TypeError on the subscript below.
        if download_button is None or not download_button['href']:
            raise ValueError("Não existe link para download \n")

        # Resolve relative hrefs against the page URL; absolute hrefs are
        # returned as absolute URLs.
        self.download_link = urljoin(self.url, download_button['href'])
        self.nome_arquivo = self._extrai_nome_arquivo(self.download_link)

        download_response = requests.get(
            self.download_link, timeout=self.TIMEOUT_SEGUNDOS
        )
        if download_response.status_code != 200:
            raise ValueError(
                f"Não foi possivel baixar o arquivo {self.nome_arquivo} \n"
            )

        self.path_destino = Path(
            str(self.path_raiz) + self.diretorio + self.nome_arquivo
        )
        # Create the destination folder on first run.
        self.path_destino.parent.mkdir(parents=True, exist_ok=True)
        with open(str(self.path_destino), 'wb') as file:
            file.write(download_response.content)
        return self.nome_arquivo
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
import pandas as pd | ||
from pathlib import Path | ||
|
||
|
||
class IeaHandler:
    """Read an IEA spreadsheet and export its "Total energy supply (PJ)" rows.

    The workbook is expected under ``<path>/constants/IEA/<file>``; the
    selected rows are written to ``tabelas_paises_TES.csv`` in that same
    folder.

    Attributes:
        file (str): name of the workbook to read
        n_table (int): table counter, initialised to zero (not used by the
            methods in this class yet)
        tables (list): initialised empty (not used by the methods in this
            class yet)
        path (str): directory that contains this module; base for the
            ``constants/IEA`` folder
        paises (list[str]): country names of interest (not applied as a
            filter yet, see the TODO notes in ``__init__``)
    """

    def __init__(self, file) -> None:
        """
        :param file: name of the workbook inside ``constants/IEA``
        """
        self.file = file
        self.n_table = 0
        self.tables = []
        self.path = str(Path(__file__).parent.resolve())
        self.paises = [
            "Argentina",
            "Australia",
            "Austria",
            "Belgium",
            "Brazil",
            "Canada",
            "Chile",
            "China",
            "Colombia",
            "Czech Republic",
            "Denmark",
            "Egypt",
            "Estonia",
            "Finland",
            "France",
            "Germany",
            "Greece",
            "Hungary",
            "India",
            "Indonesia",
            "Ireland",
            "Israel",
            "Italy",
            "Japan",
            "Kenya",
            "Korea",
            "Latvia",
            "Lithuania",
            "Luxembourg",
            "Mexico",
            "Morocco",
            "New Zealand",
            "Norway",
            "Poland",
            "Portugal",
            "Senegal",
            "Singapore",
            "Slovak Republic",
            "South Africa",
            "Spain",
            "Sweden",
            "Switzerland",
            "The Netherlands",
            "Thailand",
            "Türkiye",
            "Ukraine",
            "United Kingdom",
            "United States"
        ]
        # TODO: methods are still needed to read the xlsx file and go to the
        # right sheet of the workbook
        # TODO: a method is still needed that, when reading the xlsx file,
        # selects the rows whose flow column equals Total energy supply (PJ)
        # and whose country column matches the countries listed in paises

    @staticmethod
    def _formata_tabela_tes(df_sheet: pd.DataFrame) -> pd.DataFrame:
        """Return the "Total energy supply (PJ)" rows of ``df_sheet``.

        Columns at positions 3-5 are discarded. The first three columns are
        named ``PAIS``, ``PRODUTO`` and ``DADO_TIPO``; the remaining columns
        take their names from the first data row of ``df_sheet`` (positions 6
        onward). Cells equal to ``'..'`` are replaced by ``0``.

        :param df_sheet: sheet as loaded by ``pd.read_excel``
        :return: new DataFrame; ``df_sheet`` is left unmodified
        """
        mascara_tes = df_sheet.iloc[:, 2] == 'Total energy supply (PJ)'
        posicoes_mantidas = [0, 1, 2, *range(6, df_sheet.shape[1])]
        df_tes = df_sheet.loc[mascara_tes].iloc[:, posicoes_mantidas].copy()
        # Assign labels explicitly: writing into ``columns.values`` mutates an
        # Index buffer shared with ``df_sheet`` and is not supported by pandas.
        df_tes.columns = [
            'PAIS', 'PRODUTO', 'DADO_TIPO', *df_sheet.iloc[0, 6:].tolist()
        ]
        return df_tes.replace('..', 0)

    def formatar_xlsx_IEA(self) -> None:
        """Read the workbook's fourth sheet and save the selected rows as CSV.

        The sheet at index 3 is loaded, reduced by ``_formata_tabela_tes`` and
        written to ``tabelas_paises_TES.csv`` without the index column.

        :return: None
        """
        pasta_iea = Path(self.path) / "constants" / "IEA"
        # Passing the path lets pandas open and close the workbook itself, so
        # no file handle is left open.
        df_iea_sheet_principal = pd.read_excel(pasta_iea / self.file, sheet_name=3)
        df_novo_csv = self._formata_tabela_tes(df_iea_sheet_principal)
        df_novo_csv.to_csv(pasta_iea / "tabelas_paises_TES.csv", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters