Commit

feat(aggregation-api): add cluster column for details-XXX query files
mabw-rte committed Aug 20, 2024
1 parent 4eccd53 commit f2ee960
Showing 17 changed files with 33,802 additions and 4,146 deletions.
142 changes: 103 additions & 39 deletions antarest/study/business/aggregator_management.py
@@ -27,10 +27,17 @@
"""Column name for the time index."""
TIME_COL = "time"
"""Column name for the timestamp."""
CLUSTER_ID_COL = "cluster"
"""Column name for the cluster id."""
MC_YEAR_INDEX = 0
"""Index in path parts starting from the Monte Carlo year to determine the Monte Carlo year."""
AREA_OR_LINK_INDEX__IND, AREA_OR_LINK_INDEX__ALL = 2, 1
"""Indexes in path parts starting from the output root `economy//mc-(ind/all)` to determine the area/link name."""
PRODUCTION_COLUMN_NAME = "production"
PRODUCTION_COLUMN_REGEX = "mwh"
CLUSTER_ID_COMPONENT = 0
ACTUAL_COLUMN_COMPONENT = 1
DUMMY_COMPONENT = 2
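# For details files, each raw column header parses into a 3-component tuple,
# e.g. ("01_solar", "MWh", "EXP") (illustrative values): component 0 is the
# cluster id, component 1 the actual column name, and component 2 a dummy
# statistic suffix.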

logger = logging.getLogger(__name__)

@@ -71,6 +78,42 @@ def _checks_estimated_size(nb_files: int, df_bytes_size: int, nb_files_checked:
raise FileTooLargeError(estimated_df_size, maximum_size)


def _columns_ordering(df_cols: t.List[str], column_name: str, is_details: bool, mc_root: MCRoot) -> t.Sequence[str]:
org_cols = df_cols.copy()
if mc_root == MCRoot.MC_ALL:
org_cols = [
col for col in org_cols if col not in {column_name, CLUSTER_ID_COL, MCYEAR_COL, TIME_ID_COL, TIME_COL}
]
if is_details:
org_cols = [col for col in org_cols if col != CLUSTER_ID_COL and col != TIME_ID_COL]
if mc_root == MCRoot.MC_IND:
new_column_order = (
[column_name] + ([CLUSTER_ID_COL] if is_details else []) + [MCYEAR_COL, TIME_ID_COL, TIME_COL] + org_cols
)
elif mc_root == MCRoot.MC_ALL:
new_column_order = [column_name] + ([CLUSTER_ID_COL] if is_details else []) + [TIME_ID_COL, TIME_COL] + org_cols
else:
raise NotImplementedError(f"Unknown Monte Carlo root: {mc_root}")

return new_column_order
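# Illustrative call (assuming MCYEAR_COL == "mcYear", TIME_ID_COL == "timeId" and
# TIME_COL == "time"):
#     _columns_ordering(["cluster", "timeId", "production"], "area", True, MCRoot.MC_IND)
# returns ["area", "cluster", "mcYear", "timeId", "time", "production"].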


def _infer_production_column(cols: t.Sequence[str]) -> t.Optional[str]:
prod_col = None
for c in cols:
if PRODUCTION_COLUMN_REGEX in c.lower().strip():
prod_col = c
break
return prod_col
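# e.g. _infer_production_column(["MWh", "NODU", "NP Cost - Euro"]) returns "MWh"
# (the substring "mwh" matches); it returns None when no column matches.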


def _infer_time_id(df: pd.DataFrame, is_details: bool) -> t.List[int]:
if is_details:
return df[TIME_ID_COL].tolist()
else:
return list(range(1, len(df) + 1))
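# Details frames reuse the per-cluster timeId column built in _process_df;
# other frames simply get time ids 1..len(df).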


class AggregatorManager:
def __init__(
self,
@@ -108,7 +151,7 @@ def __init__(
else MCRoot.MC_ALL
)

-    def _parse_output_file(self, file_path: Path) -> pd.DataFrame:
+    def _parse_output_file(self, file_path: Path, normalize: bool = True) -> pd.DataFrame:
csv_file = pd.read_csv(
file_path,
sep="\t",
@@ -121,10 +164,17 @@ def _parse_output_file(self, file_path: Path) -> pd.DataFrame:
date, body = date_serializer.extract_date(csv_file)
df = rename_unnamed(body).astype(float)

if not normalize:
df.index = date
return df

# normalize columns names
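        # e.g. for mc-all outputs a 3-component header ("OP. COST", "Euro", "EXP")
        # is joined into "OP. COST EXP" (illustrative tuple; see the values fixture below)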
new_cols = []
for col in body.columns:
-            name_to_consider = col[0] if self.query_file.value == MCIndAreasQueryFile.VALUES else " ".join(col)
+            if self.mc_root == MCRoot.MC_IND:
+                name_to_consider = col[0] if self.query_file.value == MCIndAreasQueryFile.VALUES else " ".join(col)
+            else:
+                name_to_consider = " ".join([col[0], col[2]])
new_cols.append(name_to_consider.upper().strip())

df.index = date
@@ -202,6 +252,10 @@ def columns_filtering(self, df: pd.DataFrame, is_details: bool) -> pd.DataFrame:
lower_case_columns = [c.lower() for c in self.columns_names]
if lower_case_columns:
if is_details:
filtered_columns = [CLUSTER_ID_COL, TIME_ID_COL] + [
c for c in df.columns.tolist() if any(regex in c.lower() for regex in lower_case_columns)
]
elif self.mc_root == MCRoot.MC_ALL:
filtered_columns = [
c for c in df.columns.tolist() if any(regex in c.lower() for regex in lower_case_columns)
]
@@ -210,7 +264,41 @@ def columns_filtering(self, df: pd.DataFrame, is_details: bool) -> pd.DataFrame:
df = df.loc[:, filtered_columns]
return df

def _process_df(self, file_path: Path, is_details: bool) -> pd.DataFrame:
if is_details:
un_normalized_df = self._parse_output_file(file_path, normalize=False)
df_len = len(un_normalized_df)
cluster_dummy_product_cols = sorted(
set([(x[CLUSTER_ID_COMPONENT], x[DUMMY_COMPONENT]) for x in un_normalized_df.columns])
)
actual_cols = sorted(set(un_normalized_df.columns.map(lambda x: x[ACTUAL_COLUMN_COMPONENT])))
new_obj: t.Dict[str, t.Any] = {k: [] for k in actual_cols}
new_obj[CLUSTER_ID_COL] = []
new_obj[TIME_ID_COL] = []
for cluster_id, dummy_component in cluster_dummy_product_cols:
for actual_col in actual_cols:
col_values = un_normalized_df[(cluster_id, actual_col, dummy_component)].tolist() # type: ignore
new_obj[actual_col] += col_values
new_obj[CLUSTER_ID_COL] += [cluster_id for _ in range(df_len)]
new_obj[TIME_ID_COL] += list(range(1, df_len + 1))

prod_col = _infer_production_column(actual_cols)
if prod_col is not None:
new_obj[PRODUCTION_COLUMN_NAME] = new_obj.pop(prod_col)
actual_cols.remove(prod_col)

add_prod = [PRODUCTION_COLUMN_NAME] if prod_col is not None else []
columns_order = [CLUSTER_ID_COL, TIME_ID_COL] + add_prod + list(actual_cols)
df = pd.DataFrame(new_obj).reindex(columns=columns_order).sort_values(by=[TIME_ID_COL, CLUSTER_ID_COL])
df.index = pd.Index(list(range(1, len(df) + 1)))

return df

else:
return self._parse_output_file(file_path)
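    # A minimal sketch of the unpivot above, assuming illustrative data shaped like
    # the details fixture at the end of this diff:
    #
    #     cols = pd.MultiIndex.from_tuples(
    #         [("01_solar", "MWh", "EXP"), ("01_solar", "NODU", "EXP"),
    #          ("02_wind_on", "MWh", "EXP"), ("02_wind_on", "NODU", "EXP")]
    #     )
    #     wide = pd.DataFrame([[315000.0, 167.0, 275000.0, 147.0]], columns=cols)
    #
    # _process_df flattens this into one row per (timeId, cluster), with the
    # "MWh" column renamed to "production":
    #
    #        cluster     timeId  production   NODU
    #     1  01_solar         1    315000.0  167.0
    #     2  02_wind_on       1    275000.0  147.0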

def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
assert self.mc_root in [MCRoot.MC_IND, MCRoot.MC_ALL], f"Unknown Monte Carlo root: {self.mc_root}"
is_details = self.query_file in [
MCIndAreasQueryFile.DETAILS,
MCAllAreasQueryFile.DETAILS,
@@ -222,14 +310,14 @@ def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
final_df = pd.DataFrame()
nb_files = len(files)
for k, file_path in enumerate(files):
-            df = self._parse_output_file(file_path)
+            df = self._process_df(file_path, is_details)

# columns filtering
df = self.columns_filtering(df, is_details)

# if no columns, no need to continue
list_of_df_columns = df.columns.tolist()
-            if not list_of_df_columns:
+            if not list_of_df_columns or set(list_of_df_columns) == {CLUSTER_ID_COL, TIME_ID_COL}:
return pd.DataFrame()

# checks if the estimated dataframe size does not exceed the limit
@@ -239,50 +327,26 @@ def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
estimated_binary_size = final_df.memory_usage().sum()
_checks_estimated_size(nb_files, estimated_binary_size, k)

+            column_name = AREA_COL if self.output_type == "areas" else LINK_COL
+            new_column_order = _columns_ordering(list_of_df_columns, column_name, is_details, self.mc_root)
+
            if self.mc_root == MCRoot.MC_IND:
                # add column for links/areas
                relative_path_parts = file_path.relative_to(self.mc_ind_path).parts
-               column_name = AREA_COL if self.output_type == "areas" else LINK_COL
-               new_column_order = [column_name, MCYEAR_COL, TIME_ID_COL, TIME_COL] + list_of_df_columns
                df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__IND]

                # add column to record the Monte Carlo year
                df[MCYEAR_COL] = int(relative_path_parts[MC_YEAR_INDEX])

-               # add a column for the time id
-               df[TIME_ID_COL] = list(range(1, len(df) + 1))
-               # add horizon column
-               df[TIME_COL] = horizon
-
-               # Reorganize the columns
-               df = df.reindex(columns=new_column_order)
            else:
-               # first columns df
-               # first we include the time id column
-               first_columns_df = pd.DataFrame(
-                   {
-                       TIME_ID_COL: list(range(1, len(df) + 1)),
-                   }
-               )
-
                # add column for links/areas
                relative_path_parts = file_path.relative_to(self.mc_all_path).parts
-               column_name = AREA_COL if self.output_type == "areas" else LINK_COL
-               first_columns_df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__ALL]
-
-               # add horizon column
-               first_columns_df[TIME_COL] = horizon
-
-               # reorder first columns
-               new_column_order = [column_name, TIME_ID_COL, TIME_COL]
-               first_columns_df = first_columns_df.reindex(columns=new_column_order)
-
-               # reset index
-               # noinspection PyTypeChecker
-               first_columns_df.set_index(df.index, inplace=True)
-
-               # merge first columns with the rest of the df
-               df = pd.merge(first_columns_df, df, left_index=True, right_index=True)
+               df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__ALL]
+
+           # add a column for the time id
+           df[TIME_ID_COL] = _infer_time_id(df, is_details)
+           # add horizon column
+           df[TIME_COL] = horizon
+           # Reorganize the columns
+           df = df.reindex(columns=pd.Index(new_column_order))

final_df = pd.concat([final_df, df], ignore_index=True)

@@ -1,4 +1,4 @@
- area timeId time OV. COST OP. COST OP. COST.1 OP. COST.2 OP. COST.3 MRG. PRICE MRG. PRICE.1 MRG. PRICE.2 MRG. PRICE.3 CO2 EMIS. BALANCE BALANCE.1 BALANCE.2 BALANCE.3 ROW BAL. PSP MISC. NDG LOAD LOAD.1 LOAD.2 LOAD.3 H. ROR H. ROR.1 H. ROR.2 H. ROR.3 WIND WIND.1 WIND.2 WIND.3 SOLAR SOLAR.1 SOLAR.2 SOLAR.3 NUCLEAR NUCLEAR.1 NUCLEAR.2 NUCLEAR.3 LIGNITE LIGNITE.1 LIGNITE.2 LIGNITE.3 COAL COAL.1 COAL.2 COAL.3 GAS GAS.1 GAS.2 GAS.3 OIL OIL.1 OIL.2 OIL.3 MIX. FUEL MIX. FUEL.1 MIX. FUEL.2 MIX. FUEL.3 MISC. DTG MISC. DTG.1 MISC. DTG.2 MISC. DTG.3 H. STOR H. STOR.1 H. STOR.2 H. STOR.3 H. PUMP H. PUMP.1 H. PUMP.2 H. PUMP.3 H. LEV H. LEV.1 H. LEV.2 H. LEV.3 H. INFL H. INFL.1 H. INFL.2 H. INFL.3 H. OVFL H. OVFL.1 H. OVFL.2 H. OVFL.3 H. VAL H. VAL.1 H. VAL.2 H. VAL.3 H. COST H. COST.1 H. COST.2 H. COST.3 UNSP. ENRG UNSP. ENRG.1 UNSP. ENRG.2 UNSP. ENRG.3 SPIL. ENRG SPIL. ENRG.1 SPIL. ENRG.2 SPIL. ENRG.3 LOLD LOLD.1 LOLD.2 LOLD.3 LOLP AVL DTG AVL DTG.1 AVL DTG.2 AVL DTG.3 DTG MRG DTG MRG.1 DTG MRG.2 DTG MRG.3 MAX MRG MAX MRG.1 MAX MRG.2 MAX MRG.3 NP COST NP COST.1 NP COST.2 NP COST.3 NODU NODU.1 NODU.2 NODU.3
+ area timeId time OV. COST EXP OP. COST EXP OP. COST STD OP. COST MIN OP. COST MAX MRG. PRICE EXP MRG. PRICE STD MRG. PRICE MIN MRG. PRICE MAX CO2 EMIS. EXP BALANCE EXP BALANCE STD BALANCE MIN BALANCE MAX ROW BAL. VALUES PSP EXP MISC. NDG EXP LOAD EXP LOAD STD LOAD MIN LOAD MAX H. ROR EXP H. ROR STD H. ROR MIN H. ROR MAX WIND EXP WIND STD WIND MIN WIND MAX SOLAR EXP SOLAR STD SOLAR MIN SOLAR MAX NUCLEAR EXP NUCLEAR STD NUCLEAR MIN NUCLEAR MAX LIGNITE EXP LIGNITE STD LIGNITE MIN LIGNITE MAX COAL EXP COAL STD COAL MIN COAL MAX GAS EXP GAS STD GAS MIN GAS MAX OIL EXP OIL STD OIL MIN OIL MAX MIX. FUEL EXP MIX. FUEL STD MIX. FUEL MIN MIX. FUEL MAX MISC. DTG EXP MISC. DTG STD MISC. DTG MIN MISC. DTG MAX H. STOR EXP H. STOR STD H. STOR MIN H. STOR MAX H. PUMP EXP H. PUMP STD H. PUMP MIN H. PUMP MAX H. LEV EXP H. LEV STD H. LEV MIN H. LEV MAX H. INFL EXP H. INFL STD H. INFL MIN H. INFL MAX H. OVFL EXP H. OVFL STD H. OVFL MIN H. OVFL MAX H. VAL EXP H. VAL STD H. VAL MIN H. VAL MAX H. COST EXP H. COST STD H. COST MIN H. COST MAX UNSP. ENRG EXP UNSP. ENRG STD UNSP. ENRG MIN UNSP. ENRG MAX SPIL. ENRG EXP SPIL. ENRG STD SPIL. ENRG MIN SPIL. ENRG MAX LOLD EXP LOLD STD LOLD MIN LOLD MAX LOLP VALUES AVL DTG EXP AVL DTG STD AVL DTG MIN AVL DTG MAX DTG MRG EXP DTG MRG STD DTG MRG MIN DTG MRG MAX MAX MRG EXP MAX MRG STD MAX MRG MIN MAX MRG MAX NP COST EXP NP COST STD NP COST MIN NP COST MAX NODU EXP NODU STD NODU MIN NODU MAX
de 1 2030 282000.0 282000.0 0.0 282000.0 282000.0 11.66 0.0 11.66 11.66 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27600.0 0.0 27600.0 27600.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27600.0 0.0 27600.0 27600.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 404400.0 0.0 404400.0 404400.0 404400.0 0.0 404400.0 404400.0 0.0 0.0 0.0 0.0 26.0 0.0 26.0 26.0
de 2 2030 1252000.0 1252000.0 0.0 1252000.0 1252000.0 23.33 0.0 23.33 23.33 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85200.0 0.0 85200.0 85200.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85200.0 0.0 85200.0 85200.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 346800.0 0.0 346800.0 346800.0 346800.0 0.0 346800.0 346800.0 0.0 0.0 0.0 0.0 55.0 0.0 55.0 55.0
de 3 2030 2910000.0 2910000.0 0.0 2910000.0 2910000.0 35.0 0.0 35.0 35.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 142800.0 0.0 142800.0 142800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 142800.0 0.0 142800.0 142800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 289200.0 0.0 289200.0 289200.0 289200.0 0.0 289200.0 289200.0 0.0 0.0 0.0 0.0 83.0 0.0 83.0 83.0
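(In the mc-all values header above, positional duplicates such as OP. COST, OP. COST.1, OP. COST.2, OP. COST.3 become explicit statistic columns OP. COST EXP/STD/MIN/MAX, matching the new mc-all name normalization in _parse_output_file.)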
@@ -1,2 +1,10 @@
- area timeId time 01_SOLAR MWH EXP 02_WIND_ON MWH EXP 03_WIND_OFF MWH EXP 04_RES MWH EXP 05_NUCLEAR MWH EXP 06_COAL MWH EXP 07_GAS MWH EXP 08_NON-RES MWH EXP 09_HYDRO_PUMP MWH EXP 01_SOLAR NP COST - EURO EXP 02_WIND_ON NP COST - EURO EXP 03_WIND_OFF NP COST - EURO EXP 04_RES NP COST - EURO EXP 05_NUCLEAR NP COST - EURO EXP 06_COAL NP COST - EURO EXP 07_GAS NP COST - EURO EXP 08_NON-RES NP COST - EURO EXP 09_HYDRO_PUMP NP COST - EURO EXP 01_SOLAR NODU EXP 02_WIND_ON NODU EXP 03_WIND_OFF NODU EXP 04_RES NODU EXP 05_NUCLEAR NODU EXP 06_COAL NODU EXP 07_GAS NODU EXP 08_NON-RES NODU EXP 09_HYDRO_PUMP NODU EXP
- de 1 2030 315000.0 275000.0 235000.0 195000.0 155000.0 115000.0 75000.0 35000.0 2800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 167.0 147.0 127.0 107.0 87.0 67.0 47.0 27.0 7.0
+ area cluster timeId time production NODU NP Cost - Euro
+ de 01_solar 1 2030 315000.0 167.0 0.0
+ de 02_wind_on 1 2030 275000.0 147.0 0.0
+ de 03_wind_off 1 2030 235000.0 127.0 0.0
+ de 04_res 1 2030 195000.0 107.0 0.0
+ de 05_nuclear 1 2030 155000.0 87.0 0.0
+ de 06_coal 1 2030 115000.0 67.0 0.0
+ de 07_gas 1 2030 75000.0 47.0 0.0
+ de 08_non-res 1 2030 35000.0 27.0 0.0
+ de 09_hydro_pump 1 2030 2800.0 7.0 0.0
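(The details fixture above shows the effect of the new _process_df unpivot: the single wide row per area, with one column per cluster/variable pair, becomes one row per cluster, keyed by the new cluster column and sharing the production / NODU / NP Cost - Euro schema.)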