process_all_data_for_submission.py
# -*- coding: utf-8 -*-
# #
# __author__ = Adarsh Kalikadien #
# __institution__ = TU Delft #
# __contact__ = [email protected] #
"""process everything into a single solution.csv file for submission, see dataset/1.initial_datasets/sample_submission.csv
We need the following data: biomass_forecast for 2018 and 2019, depot_location and refinery_location for 2018/2019
(stays the same for both years), pellet_demand_supply and biomass_demand_supply for 2018 and 2019"""
import random
import pandas as pd
import numpy as np
import pickle


# function to process the biomass forecast df into a df with the required format
def process_biomass_forecast(df):
    # ToDo: this function is now specific for 230809_RF_biomass_prediction.csv, change if needed e.g. when index is already in df as column
    # unpivot the 2018 and 2019 columns such that we get the following columns:
    # year data_type source_index destination_index value
    # 2018 biomass_forecast index_from_df NaN biomass_forecast_value
    # drop the longitude, latitude and index columns
    df = df.drop(columns=['Longitude', 'Latitude', 'Index'])
    # unpivot 2018 and 2019 columns, keeping the original row index so it can be reused as source_index
    df = pd.melt(df.reset_index(), id_vars=['index'], var_name='year', value_name='value')
    # the melted 'index' column holds the original site index for both years, so rename it to source_index
    # (using df.index after the melt would give the second year's rows running indices instead of the original site indices)
    df = df.rename(columns={'index': 'source_index'})
    # convert year column to int
    df['year'] = df['year'].astype(int)
    # add data_type column
    df['data_type'] = 'biomass_forecast'
    # destination_index is not applicable for forecasts, set it to NaN
    df['destination_index'] = np.nan
    return df
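
# Example (a hypothetical forecast row, purely to illustrate the reshape done by process_biomass_forecast;
# the numbers are made up):
#   Index  Latitude  Longitude   2018    2019
#   0      23.5      72.1        110.2   95.7
# becomes two long-format rows:
#   year  data_type         source_index  destination_index  value
#   2018  biomass_forecast  0             NaN                110.2
#   2019  biomass_forecast  0             NaN                95.7
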
def process_flow_matrix(df, year, biomass_or_pellet):
    # in the flow matrix the source_index is the index of the df and the destination_index is the column
    # the value is then the flow from source_index to destination_index in the matrix
    # year data_type source_index destination_index value
    # 2018 {biomass/pellet}_demand_supply index_from_df column_from_df {biomass/pellet}_demand_supply_value
    # if df is a numpy array, convert to pandas df
    if isinstance(df, np.ndarray):
        df = pd.DataFrame(df)
    # collect the long-format rows and build the transformed DataFrame in one go
    transformed_data = []
    for source_index in df.index:
        for destination_index in df.columns:
            if source_index != destination_index:
                value = df.loc[source_index, destination_index]
                if value >= 0:  # we also want to include 0 values to adhere to constraint 1 of the problem
                    data_type = f"{biomass_or_pellet}_demand_supply"
                    transformed_data.append({'year': year, 'data_type': data_type, 'source_index': source_index,
                                             'destination_index': destination_index, 'value': value})
    transformed_df = pd.DataFrame(transformed_data)
    # convert year column to int
    transformed_df['year'] = transformed_df['year'].astype(int)
    # if type is biomass and any of the biomass site indices (0 to 2417) is missing in source_index, just
    # add it with value 0 and pick a random destination_index
    if biomass_or_pellet == 'biomass':
        for index in range(2418):
            if index not in transformed_df['source_index'].unique():
                transformed_df = pd.concat([transformed_df,
                                            pd.DataFrame({'year': year, 'data_type': 'biomass_demand_supply',
                                                          'source_index': index,
                                                          'destination_index': random.choice(transformed_df['destination_index'].unique()),
                                                          'value': 0}, index=[0])], ignore_index=True)
    return transformed_df
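
# Example (a hypothetical 2x2 depot-to-refinery flow matrix, purely to illustrate the long format produced above;
# the numbers are made up):
#   flow = np.array([[0.0, 5.0],
#                    [3.0, 0.0]])
#   process_flow_matrix(flow, 2018, 'pellet') gives:
#   year  data_type             source_index  destination_index  value
#   2018  pellet_demand_supply  0             1                  5.0
#   2018  pellet_demand_supply  1             0                  3.0
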
def process_depot_biorefinery_location(list_of_location_indices, year, depot_or_biorefinery):
    # year data_type source_index destination_index value
    # 2018 {depot/biorefinery}_location index_from_list NaN NaN
    # create an empty DataFrame to store the transformed data
    transformed_df = pd.DataFrame(columns=['year', 'data_type', 'source_index', 'destination_index', 'value'])
    # iterate over the list and add each location index to transformed_df
    for index in list_of_location_indices:
        transformed_df = pd.concat([transformed_df,
                                    pd.DataFrame({'year': year, 'data_type': f"{depot_or_biorefinery}_location",
                                                  'source_index': index, 'destination_index': np.nan,
                                                  'value': np.nan}, index=[0])], ignore_index=True)
    # convert year column to int
    transformed_df['year'] = transformed_df['year'].astype(int)
    return transformed_df
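
# Example (hypothetical location indices, purely to illustrate the rows produced above):
#   process_depot_biorefinery_location([410, 1523], 20182019, 'depot') gives:
#   year      data_type       source_index  destination_index  value
#   20182019  depot_location  410           NaN                NaN
#   20182019  depot_location  1523          NaN                NaN
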
if __name__ == "__main__":
    # create empty df to store all data
    all_data_df = pd.DataFrame(columns=['year', 'data_type', 'source_index', 'destination_index', 'value'])

    # biomass forecast processing
    # read and process biomass forecast, write to csv file for example submission
    biomass_forecast_df = pd.read_csv('dataset/3.predictions/20230826Biomass_Predictions.csv')
    biomass_forecast_df = process_biomass_forecast(biomass_forecast_df)

    # read biomass flows for 2018 and 2019
    # for Vivek's file
    # biomass_flow_2018_df = pd.read_csv('dataset/3.predictions/LastTry2018_flow.csv')
    # biomass_flow_2019_df = pd.read_csv('dataset/3.predictions/LastTry2019_flow.csv')
    biomass_flow_2018_df = pd.read_csv('dataset/3.predictions/biomass_flow_2018.csv')
    biomass_flow_2019_df = pd.read_csv('dataset/3.predictions/biomass_flow_2019.csv')
    # read pellet flows for 2018 and 2019
    pellet_flow_2018_df = pd.read_csv('dataset/3.predictions/pellet_flow_2018.csv')
    pellet_flow_2019_df = pd.read_csv('dataset/3.predictions/pellet_flow_2019.csv')

    # # for the genetic algo we can get the flow matrix from the class itself (to make sure changes to the processing functions work)
    # # load pickle files for the 2018 and 2019 genetic algorithm classes
    # biomass_gen_algo_2018 = pickle.load(open('dataset/3.predictions/optimizer_2018_dpts_20_brfnrs_4_pop_200.pkl', 'rb'))
    # biomass_flow_2018_df = process_flow_matrix(biomass_gen_algo_2018.flow_sites_to_depots, biomass_gen_algo_2018.year, 'biomass')
    # # in biomass_flow_df the destination_index is now the depot's index, but this should be mapped to the correct
    # # value from the list of self.depot_cluster_center_location_indices
    # biomass_flow_2018_df['destination_index'] = biomass_gen_algo_2018.depot_cluster_center_location_indices[
    #     biomass_flow_2018_df['destination_index'].values]
    # pellet_flow_2018_df = process_flow_matrix(biomass_gen_algo_2018.flow_depots_to_biorefineries, biomass_gen_algo_2018.year, 'pellet')
    # # in pellet_flow_df the destination_index is now the biorefinery's index, but this should be mapped to the correct
    # # value from the list of self.refinery_cluster_center_location_indices
    # pellet_flow_2018_df['destination_index'] = biomass_gen_algo_2018.refinery_cluster_center_location_indices[
    #     pellet_flow_2018_df['destination_index'].values]
    # # and the source_index should be mapped to the correct value from the list of self.depot_cluster_center_location_indices
    # pellet_flow_2018_df['source_index'] = biomass_gen_algo_2018.depot_cluster_center_location_indices[
    #     pellet_flow_2018_df['source_index'].values]
    # # do the same for 2019
    # biomass_gen_algo_2019 = pickle.load(open('dataset/3.predictions/optimizer_2019_dpts_20_brfnrs_4_pop_200.pkl', 'rb'))
    # biomass_flow_2019_df = process_flow_matrix(biomass_gen_algo_2019.flow_sites_to_depots, biomass_gen_algo_2019.year, 'biomass')
    # # in biomass_flow_df the destination_index is now the depot's index, but this should be mapped to the correct
    # # value from the list of self.depot_cluster_center_location_indices
    # biomass_flow_2019_df['destination_index'] = biomass_gen_algo_2019.depot_cluster_center_location_indices[
    #     biomass_flow_2019_df['destination_index'].values]
    # pellet_flow_2019_df = process_flow_matrix(biomass_gen_algo_2019.flow_depots_to_biorefineries, biomass_gen_algo_2019.year, 'pellet')
    # # in pellet_flow_df the destination_index is now the biorefinery's index, but this should be mapped to the correct
    # # value from the list of self.refinery_cluster_center_location_indices
    # pellet_flow_2019_df['destination_index'] = biomass_gen_algo_2019.refinery_cluster_center_location_indices[
    #     pellet_flow_2019_df['destination_index'].values]
    # # and the source_index should be mapped to the correct value from the list of self.depot_cluster_center_location_indices
    # pellet_flow_2019_df['source_index'] = biomass_gen_algo_2019.depot_cluster_center_location_indices[
    #     pellet_flow_2019_df['source_index'].values]

    # read depot and biorefinery locations (unique source and destination indices from pellet_flow_2018_df)
    # for Vivek's file we get this from the correct biomass flow matrix instead of the pellet flow matrix
    # depot_location_indices = biomass_flow_2018_df['destination_index'].unique()
    depot_location_indices = pellet_flow_2018_df['source_index'].unique()
    biorefinery_location_indices = pellet_flow_2018_df['destination_index'].unique()
    # process locations (these rows use the combined year label 20182019, since the locations are the same for both years)
    depot_location_df = process_depot_biorefinery_location(depot_location_indices, 20182019, 'depot')
    biorefinery_location_df = process_depot_biorefinery_location(biorefinery_location_indices, 20182019, 'refinery')

    # concat all dataframes in all_data_df
    all_data_df = pd.concat([all_data_df, biomass_forecast_df, biomass_flow_2018_df, biomass_flow_2019_df,
                             pellet_flow_2018_df, pellet_flow_2019_df, depot_location_df, biorefinery_location_df])
    # write to csv file
    all_data_df.to_csv('dataset/3.predictions/submission.csv', index=False)

    # biomass_forecast_df.to_csv('dataset/3.predictions/submission.csv', index=False)
    # read example submission and concat biomass_forecast_df to bottom of df
    # example_submission_df = pd.read_csv('dataset/1.initial_datasets/sample_submission.csv')
    # example_submission_df = example_submission_df[example_submission_df['data_type'] != 'biomass_forecast']
    # example_submission_df = pd.concat([example_submission_df, biomass_forecast_df])
    # # write to csv file
    # example_submission_df.to_csv('dataset/3.predictions/submission.csv', index=False)
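
    # Optional sanity check (a minimal sketch; whether the grader requires exactly these columns and data types
    # is an assumption, sample_submission.csv is the reference used above):
    # submission_df = pd.read_csv('dataset/3.predictions/submission.csv')
    # sample_df = pd.read_csv('dataset/1.initial_datasets/sample_submission.csv')
    # assert list(submission_df.columns) == list(sample_df.columns), 'column mismatch with sample submission'
    # assert set(submission_df['data_type']) >= {'biomass_forecast', 'biomass_demand_supply',
    #                                            'pellet_demand_supply', 'depot_location', 'refinery_location'}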