Commit

feat(aggregation-api): add cluster column for details-XXX query files
mabw-rte committed Aug 20, 2024
1 parent 4eccd53 commit f2ee960
Showing 17 changed files with 33,802 additions and 4,146 deletions.
142 changes: 103 additions & 39 deletions antarest/study/business/aggregator_management.py
@@ -27,10 +27,17 @@
"""Column name for the time index."""
TIME_COL = "time"
"""Column name for the timestamp."""
CLUSTER_ID_COL = "cluster"
"""Column name for the cluster id."""
MC_YEAR_INDEX = 0
"""Index in path parts starting from the Monte Carlo year to determine the Monte Carlo year."""
AREA_OR_LINK_INDEX__IND, AREA_OR_LINK_INDEX__ALL = 2, 1
"""Indexes in path parts starting from the output root `economy//mc-(ind/all)` to determine the area/link name."""
PRODUCTION_COLUMN_NAME = "production"
PRODUCTION_COLUMN_REGEX = "mwh"
CLUSTER_ID_COMPONENT = 0
ACTUAL_COLUMN_COMPONENT = 1
DUMMY_COMPONENT = 2
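# For details files, each raw column header parses into a 3-component tuple,
# e.g. ("01_solar", "MWh", "EXP") (illustrative values): component 0 is the
# cluster id, component 1 the actual column name, and component 2 a dummy
# statistic suffix.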

logger = logging.getLogger(__name__)

@@ -71,6 +78,42 @@ def _checks_estimated_size(nb_files: int, df_bytes_size: int, nb_files_checked:
raise FileTooLargeError(estimated_df_size, maximum_size)


def _columns_ordering(df_cols: t.List[str], column_name: str, is_details: bool, mc_root: MCRoot) -> t.Sequence[str]:
org_cols = df_cols.copy()
if mc_root == MCRoot.MC_ALL:
org_cols = [
col for col in org_cols if col not in {column_name, CLUSTER_ID_COL, MCYEAR_COL, TIME_ID_COL, TIME_COL}
]
if is_details:
org_cols = [col for col in org_cols if col != CLUSTER_ID_COL and col != TIME_ID_COL]
if mc_root == MCRoot.MC_IND:
new_column_order = (
[column_name] + ([CLUSTER_ID_COL] if is_details else []) + [MCYEAR_COL, TIME_ID_COL, TIME_COL] + org_cols
)
elif mc_root == MCRoot.MC_ALL:
new_column_order = [column_name] + ([CLUSTER_ID_COL] if is_details else []) + [TIME_ID_COL, TIME_COL] + org_cols
else:
raise NotImplementedError(f"Unknown Monte Carlo root: {mc_root}")

return new_column_order
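# Illustrative call (assuming MCYEAR_COL == "mcYear", TIME_ID_COL == "timeId" and
# TIME_COL == "time"):
#     _columns_ordering(["cluster", "timeId", "production"], "area", True, MCRoot.MC_IND)
# returns ["area", "cluster", "mcYear", "timeId", "time", "production"].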


def _infer_production_column(cols: t.Sequence[str]) -> t.Optional[str]:
prod_col = None
for c in cols:
if PRODUCTION_COLUMN_REGEX in c.lower().strip():
prod_col = c
break
return prod_col
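# e.g. _infer_production_column(["MWh", "NODU", "NP Cost - Euro"]) returns "MWh"
# (the substring "mwh" matches); it returns None when no column matches.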


def _infer_time_id(df: pd.DataFrame, is_details: bool) -> t.List[int]:
if is_details:
return df[TIME_ID_COL].tolist()
else:
return list(range(1, len(df) + 1))
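# Details frames reuse the per-cluster timeId column built in _process_df;
# other frames simply get time ids 1..len(df).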


class AggregatorManager:
def __init__(
self,
@@ -108,7 +151,7 @@ def __init__(
else MCRoot.MC_ALL
)

-    def _parse_output_file(self, file_path: Path) -> pd.DataFrame:
+    def _parse_output_file(self, file_path: Path, normalize: bool = True) -> pd.DataFrame:
csv_file = pd.read_csv(
file_path,
sep="\t",
@@ -121,10 +164,17 @@ def _parse_output_file(self, file_path: Path) -> pd.DataFrame:
date, body = date_serializer.extract_date(csv_file)
df = rename_unnamed(body).astype(float)

if not normalize:
df.index = date
return df

# normalize columns names
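        # e.g. for mc-all outputs a 3-component header ("OP. COST", "Euro", "EXP")
        # is joined into "OP. COST EXP" (illustrative tuple; see the values fixture below)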
new_cols = []
for col in body.columns:
-            name_to_consider = col[0] if self.query_file.value == MCIndAreasQueryFile.VALUES else " ".join(col)
+            if self.mc_root == MCRoot.MC_IND:
+                name_to_consider = col[0] if self.query_file.value == MCIndAreasQueryFile.VALUES else " ".join(col)
+            else:
+                name_to_consider = " ".join([col[0], col[2]])
new_cols.append(name_to_consider.upper().strip())

df.index = date
@@ -202,6 +252,10 @@ def columns_filtering(self, df: pd.DataFrame, is_details: bool) -> pd.DataFrame:
lower_case_columns = [c.lower() for c in self.columns_names]
if lower_case_columns:
if is_details:
filtered_columns = [CLUSTER_ID_COL, TIME_ID_COL] + [
c for c in df.columns.tolist() if any(regex in c.lower() for regex in lower_case_columns)
]
elif self.mc_root == MCRoot.MC_ALL:
filtered_columns = [
c for c in df.columns.tolist() if any(regex in c.lower() for regex in lower_case_columns)
]
@@ -210,7 +264,41 @@ def columns_filtering(self, df: pd.DataFrame, is_details: bool) -> pd.DataFrame:
df = df.loc[:, filtered_columns]
return df

def _process_df(self, file_path: Path, is_details: bool) -> pd.DataFrame:
if is_details:
un_normalized_df = self._parse_output_file(file_path, normalize=False)
df_len = len(un_normalized_df)
cluster_dummy_product_cols = sorted(
set([(x[CLUSTER_ID_COMPONENT], x[DUMMY_COMPONENT]) for x in un_normalized_df.columns])
)
actual_cols = sorted(set(un_normalized_df.columns.map(lambda x: x[ACTUAL_COLUMN_COMPONENT])))
new_obj: t.Dict[str, t.Any] = {k: [] for k in actual_cols}
new_obj[CLUSTER_ID_COL] = []
new_obj[TIME_ID_COL] = []
for cluster_id, dummy_component in cluster_dummy_product_cols:
for actual_col in actual_cols:
col_values = un_normalized_df[(cluster_id, actual_col, dummy_component)].tolist() # type: ignore
new_obj[actual_col] += col_values
new_obj[CLUSTER_ID_COL] += [cluster_id for _ in range(df_len)]
new_obj[TIME_ID_COL] += list(range(1, df_len + 1))

prod_col = _infer_production_column(actual_cols)
if prod_col is not None:
new_obj[PRODUCTION_COLUMN_NAME] = new_obj.pop(prod_col)
actual_cols.remove(prod_col)

add_prod = [PRODUCTION_COLUMN_NAME] if prod_col is not None else []
columns_order = [CLUSTER_ID_COL, TIME_ID_COL] + add_prod + list(actual_cols)
df = pd.DataFrame(new_obj).reindex(columns=columns_order).sort_values(by=[TIME_ID_COL, CLUSTER_ID_COL])
df.index = pd.Index(list(range(1, len(df) + 1)))

return df

else:
return self._parse_output_file(file_path)
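    # A minimal sketch of the unpivot above, assuming illustrative data shaped like
    # the details fixture at the end of this diff:
    #
    #     cols = pd.MultiIndex.from_tuples(
    #         [("01_solar", "MWh", "EXP"), ("01_solar", "NODU", "EXP"),
    #          ("02_wind_on", "MWh", "EXP"), ("02_wind_on", "NODU", "EXP")]
    #     )
    #     wide = pd.DataFrame([[315000.0, 167.0, 275000.0, 147.0]], columns=cols)
    #
    # _process_df flattens this into one row per (timeId, cluster), with the
    # "MWh" column renamed to "production":
    #
    #        cluster     timeId  production   NODU
    #     1  01_solar         1    315000.0  167.0
    #     2  02_wind_on       1    275000.0  147.0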

def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
assert self.mc_root in [MCRoot.MC_IND, MCRoot.MC_ALL], f"Unknown Monte Carlo root: {self.mc_root}"
is_details = self.query_file in [
MCIndAreasQueryFile.DETAILS,
MCAllAreasQueryFile.DETAILS,
@@ -222,14 +310,14 @@ def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
final_df = pd.DataFrame()
nb_files = len(files)
for k, file_path in enumerate(files):
-            df = self._parse_output_file(file_path)
+            df = self._process_df(file_path, is_details)

# columns filtering
df = self.columns_filtering(df, is_details)

# if no columns, no need to continue
list_of_df_columns = df.columns.tolist()
-            if not list_of_df_columns:
+            if not list_of_df_columns or set(list_of_df_columns) == {CLUSTER_ID_COL, TIME_ID_COL}:
return pd.DataFrame()

# checks if the estimated dataframe size does not exceed the limit
@@ -239,50 +327,26 @@ def _build_dataframe(self, files: t.Sequence[Path], horizon: int) -> pd.DataFrame:
estimated_binary_size = final_df.memory_usage().sum()
_checks_estimated_size(nb_files, estimated_binary_size, k)

+            column_name = AREA_COL if self.output_type == "areas" else LINK_COL
+            new_column_order = _columns_ordering(list_of_df_columns, column_name, is_details, self.mc_root)
+
            if self.mc_root == MCRoot.MC_IND:
                # add column for links/areas
                relative_path_parts = file_path.relative_to(self.mc_ind_path).parts
-               column_name = AREA_COL if self.output_type == "areas" else LINK_COL
-               new_column_order = [column_name, MCYEAR_COL, TIME_ID_COL, TIME_COL] + list_of_df_columns
                df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__IND]

                # add column to record the Monte Carlo year
                df[MCYEAR_COL] = int(relative_path_parts[MC_YEAR_INDEX])

-               # add a column for the time id
-               df[TIME_ID_COL] = list(range(1, len(df) + 1))
-               # add horizon column
-               df[TIME_COL] = horizon
-
-               # Reorganize the columns
-               df = df.reindex(columns=new_column_order)
            else:
-               # first columns df
-               # first we include the time id column
-               first_columns_df = pd.DataFrame(
-                   {
-                       TIME_ID_COL: list(range(1, len(df) + 1)),
-                   }
-               )
-
                # add column for links/areas
                relative_path_parts = file_path.relative_to(self.mc_all_path).parts
-               column_name = AREA_COL if self.output_type == "areas" else LINK_COL
-               first_columns_df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__ALL]
-
-               # add horizon column
-               first_columns_df[TIME_COL] = horizon
-
-               # reorder first columns
-               new_column_order = [column_name, TIME_ID_COL, TIME_COL]
-               first_columns_df = first_columns_df.reindex(columns=new_column_order)
-
-               # reset index
-               # noinspection PyTypeChecker
-               first_columns_df.set_index(df.index, inplace=True)
-
-               # merge first columns with the rest of the df
-               df = pd.merge(first_columns_df, df, left_index=True, right_index=True)
+               df[column_name] = relative_path_parts[AREA_OR_LINK_INDEX__ALL]
+
+           # add a column for the time id
+           df[TIME_ID_COL] = _infer_time_id(df, is_details)
+           # add horizon column
+           df[TIME_COL] = horizon
+           # Reorganize the columns
+           df = df.reindex(columns=pd.Index(new_column_order))

final_df = pd.concat([final_df, df], ignore_index=True)

@@ -1,4 +1,4 @@
- area timeId time OV. COST OP. COST OP. COST.1 OP. COST.2 OP. COST.3 MRG. PRICE MRG. PRICE.1 MRG. PRICE.2 MRG. PRICE.3 CO2 EMIS. BALANCE BALANCE.1 BALANCE.2 BALANCE.3 ROW BAL. PSP MISC. NDG LOAD LOAD.1 LOAD.2 LOAD.3 H. ROR H. ROR.1 H. ROR.2 H. ROR.3 WIND WIND.1 WIND.2 WIND.3 SOLAR SOLAR.1 SOLAR.2 SOLAR.3 NUCLEAR NUCLEAR.1 NUCLEAR.2 NUCLEAR.3 LIGNITE LIGNITE.1 LIGNITE.2 LIGNITE.3 COAL COAL.1 COAL.2 COAL.3 GAS GAS.1 GAS.2 GAS.3 OIL OIL.1 OIL.2 OIL.3 MIX. FUEL MIX. FUEL.1 MIX. FUEL.2 MIX. FUEL.3 MISC. DTG MISC. DTG.1 MISC. DTG.2 MISC. DTG.3 H. STOR H. STOR.1 H. STOR.2 H. STOR.3 H. PUMP H. PUMP.1 H. PUMP.2 H. PUMP.3 H. LEV H. LEV.1 H. LEV.2 H. LEV.3 H. INFL H. INFL.1 H. INFL.2 H. INFL.3 H. OVFL H. OVFL.1 H. OVFL.2 H. OVFL.3 H. VAL H. VAL.1 H. VAL.2 H. VAL.3 H. COST H. COST.1 H. COST.2 H. COST.3 UNSP. ENRG UNSP. ENRG.1 UNSP. ENRG.2 UNSP. ENRG.3 SPIL. ENRG SPIL. ENRG.1 SPIL. ENRG.2 SPIL. ENRG.3 LOLD LOLD.1 LOLD.2 LOLD.3 LOLP AVL DTG AVL DTG.1 AVL DTG.2 AVL DTG.3 DTG MRG DTG MRG.1 DTG MRG.2 DTG MRG.3 MAX MRG MAX MRG.1 MAX MRG.2 MAX MRG.3 NP COST NP COST.1 NP COST.2 NP COST.3 NODU NODU.1 NODU.2 NODU.3
+ area timeId time OV. COST EXP OP. COST EXP OP. COST STD OP. COST MIN OP. COST MAX MRG. PRICE EXP MRG. PRICE STD MRG. PRICE MIN MRG. PRICE MAX CO2 EMIS. EXP BALANCE EXP BALANCE STD BALANCE MIN BALANCE MAX ROW BAL. VALUES PSP EXP MISC. NDG EXP LOAD EXP LOAD STD LOAD MIN LOAD MAX H. ROR EXP H. ROR STD H. ROR MIN H. ROR MAX WIND EXP WIND STD WIND MIN WIND MAX SOLAR EXP SOLAR STD SOLAR MIN SOLAR MAX NUCLEAR EXP NUCLEAR STD NUCLEAR MIN NUCLEAR MAX LIGNITE EXP LIGNITE STD LIGNITE MIN LIGNITE MAX COAL EXP COAL STD COAL MIN COAL MAX GAS EXP GAS STD GAS MIN GAS MAX OIL EXP OIL STD OIL MIN OIL MAX MIX. FUEL EXP MIX. FUEL STD MIX. FUEL MIN MIX. FUEL MAX MISC. DTG EXP MISC. DTG STD MISC. DTG MIN MISC. DTG MAX H. STOR EXP H. STOR STD H. STOR MIN H. STOR MAX H. PUMP EXP H. PUMP STD H. PUMP MIN H. PUMP MAX H. LEV EXP H. LEV STD H. LEV MIN H. LEV MAX H. INFL EXP H. INFL STD H. INFL MIN H. INFL MAX H. OVFL EXP H. OVFL STD H. OVFL MIN H. OVFL MAX H. VAL EXP H. VAL STD H. VAL MIN H. VAL MAX H. COST EXP H. COST STD H. COST MIN H. COST MAX UNSP. ENRG EXP UNSP. ENRG STD UNSP. ENRG MIN UNSP. ENRG MAX SPIL. ENRG EXP SPIL. ENRG STD SPIL. ENRG MIN SPIL. ENRG MAX LOLD EXP LOLD STD LOLD MIN LOLD MAX LOLP VALUES AVL DTG EXP AVL DTG STD AVL DTG MIN AVL DTG MAX DTG MRG EXP DTG MRG STD DTG MRG MIN DTG MRG MAX MAX MRG EXP MAX MRG STD MAX MRG MIN MAX MRG MAX NP COST EXP NP COST STD NP COST MIN NP COST MAX NODU EXP NODU STD NODU MIN NODU MAX
de 1 2030 282000.0 282000.0 0.0 282000.0 282000.0 11.66 0.0 11.66 11.66 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27600.0 0.0 27600.0 27600.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27600.0 0.0 27600.0 27600.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 404400.0 0.0 404400.0 404400.0 404400.0 0.0 404400.0 404400.0 0.0 0.0 0.0 0.0 26.0 0.0 26.0 26.0
de 2 2030 1252000.0 1252000.0 0.0 1252000.0 1252000.0 23.33 0.0 23.33 23.33 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85200.0 0.0 85200.0 85200.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 85200.0 0.0 85200.0 85200.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 346800.0 0.0 346800.0 346800.0 346800.0 0.0 346800.0 346800.0 0.0 0.0 0.0 0.0 55.0 0.0 55.0 55.0
de 3 2030 2910000.0 2910000.0 0.0 2910000.0 2910000.0 35.0 0.0 35.0 35.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 142800.0 0.0 142800.0 142800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 142800.0 0.0 142800.0 142800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 432000.0 0.0 432000.0 432000.0 289200.0 0.0 289200.0 289200.0 289200.0 0.0 289200.0 289200.0 0.0 0.0 0.0 0.0 83.0 0.0 83.0 83.0
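(In the mc-all values header above, positional duplicates such as OP. COST, OP. COST.1, OP. COST.2, OP. COST.3 become explicit statistic columns OP. COST EXP/STD/MIN/MAX, matching the new mc-all name normalization in _parse_output_file.)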
@@ -1,2 +1,10 @@
- area timeId time 01_SOLAR MWH EXP 02_WIND_ON MWH EXP 03_WIND_OFF MWH EXP 04_RES MWH EXP 05_NUCLEAR MWH EXP 06_COAL MWH EXP 07_GAS MWH EXP 08_NON-RES MWH EXP 09_HYDRO_PUMP MWH EXP 01_SOLAR NP COST - EURO EXP 02_WIND_ON NP COST - EURO EXP 03_WIND_OFF NP COST - EURO EXP 04_RES NP COST - EURO EXP 05_NUCLEAR NP COST - EURO EXP 06_COAL NP COST - EURO EXP 07_GAS NP COST - EURO EXP 08_NON-RES NP COST - EURO EXP 09_HYDRO_PUMP NP COST - EURO EXP 01_SOLAR NODU EXP 02_WIND_ON NODU EXP 03_WIND_OFF NODU EXP 04_RES NODU EXP 05_NUCLEAR NODU EXP 06_COAL NODU EXP 07_GAS NODU EXP 08_NON-RES NODU EXP 09_HYDRO_PUMP NODU EXP
- de 1 2030 315000.0 275000.0 235000.0 195000.0 155000.0 115000.0 75000.0 35000.0 2800.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 167.0 147.0 127.0 107.0 87.0 67.0 47.0 27.0 7.0
+ area cluster timeId time production NODU NP Cost - Euro
+ de 01_solar 1 2030 315000.0 167.0 0.0
+ de 02_wind_on 1 2030 275000.0 147.0 0.0
+ de 03_wind_off 1 2030 235000.0 127.0 0.0
+ de 04_res 1 2030 195000.0 107.0 0.0
+ de 05_nuclear 1 2030 155000.0 87.0 0.0
+ de 06_coal 1 2030 115000.0 67.0 0.0
+ de 07_gas 1 2030 75000.0 47.0 0.0
+ de 08_non-res 1 2030 35000.0 27.0 0.0
+ de 09_hydro_pump 1 2030 2800.0 7.0 0.0
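(The details fixture above shows the effect of the new _process_df unpivot: the single wide row per area, with one column per cluster/variable pair, becomes one row per cluster, keyed by the new cluster column and sharing the production / NODU / NP Cost - Euro schema.)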