-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract.py
78 lines (70 loc) · 2.76 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# %% Import sql library
import sqlalchemy, pandas as pd, numpy as np, matplotlib.pyplot as plt, os, sys, yaml, time
from sqlalchemy import create_engine
from yaml import safe_load
# %% Make a connection to the database
info = safe_load(open(os.path.join(os.getcwd(), 'credentials.yaml')))
for key in info:
# save each key as a variable
globals()[str(key)] = info[key]
# Create the connection string
connection_string = f'postgresql://{user}:{password}@{host}:{port}/{database}'
# Create the engine
engine = create_engine(connection_string)
# %% Define the query
view = 'public.view_basis_hourly_na_timeobs'
view_columns = ['label', 'statid', 'timeobs', 'elem_no', 'elem_val']
label = [2100, 10002100, 300002100, 310002100]
elem_no = [101, 123, 113, 201, 301, 371, 365, 401, 603, 801]
timeobs_start = '1958-01-01 00:00'
timeobs_end = '2023-01-01 00:00'
def every_year(start, end, order='ASC'):
start_year = pd.to_datetime(start).year
end_year = pd.to_datetime(end).year
if order == 'ASC':
return [f'{year}-01-01 00:00' for year in range(start_year, end_year+1)]
elif order == 'DESC':
return [f'{year}-01-01 00:00' for year in range(end_year, start_year-1, -1)]
else:
raise ValueError('order must be either ASC or DESC')
start = pd.to_datetime(timeobs_start)
end = pd.to_datetime(timeobs_end)
every_year = every_year(start, end)
time_very_start = time.time()
# create cirectory to save the data
directory = os.path.join(os.getcwd(), 'extractions')
if not os.path.exists(directory):
os.makedirs(directory)
for year in every_year[:-1]:
year_int = int(year.split('-')[0])
next_year = year_int + 1
print(f'Working on {year_int} to {next_year}')
time_start = time.time()
sql_query = f"""
SELECT {', '.join(view_columns)}
FROM {view}
WHERE label IN {tuple(label)}
AND elem_no IN {tuple(elem_no)}
AND timeobs >= '{year_int}-01-01 00:00'
AND timeobs < '{next_year}-01-01 00:00';
"""
connection = engine.connect()
print(sql_query)
# Execute the query and store in numpy array
df = pd.read_sql_query(sql_query, con = connection, parse_dates=['timeobs'])
# Close the connection
connection.close()
# Save the data frame with the year as the name
df.to_csv(os.path.join(directory, f'{year_int}.csv'), index=False)
time_end = time.time()
print(f'Time taken: {time_end - time_start} seconds')
time_very_end = time.time()
print(f'Total time taken: {time_very_end - time_very_start} seconds')
# %%
# one dir back
directory = os.path.join(os.path.dirname(os.getcwd()), 'extractions')
print(directory)
# %%
# Combine all the csv files into one
df = pd.concat([pd.read_csv(os.path.join(directory, file)) for file in os.listdir(directory)])
df.to_csv(os.path.join(directory, 'all.csv'), index=False)