From c09fd4062c70a12eb99e29e263844b53083aecfb Mon Sep 17 00:00:00 2001 From: Miki Verma Date: Tue, 24 Sep 2024 23:32:05 +0200 Subject: [PATCH] run black linter --- .github/workflows/lint.yml | 46 + deprecated/Yearly_model/Full_data_creation.py | 209 +++-- .../Yearly_model/Metadata_upgrade_creation.py | 466 ++++++---- deprecated/Yearly_model/Resstock_model.py | 475 ++++++---- deprecated/Yearly_model/Weather_gen.py | 202 +++-- deprecated/Yearly_model/util_datagen.py | 71 +- deprecated/infra_agnostic/datagen.py | 814 ++++++++++-------- deprecated/infra_agnostic/model.py | 239 ++--- deprecated/infra_agnostic/utils.py | 9 +- scripts/build_feature_store.py | 36 +- scripts/extract_data.py | 14 +- scripts/gpu_usage.py | 6 +- scripts/model_evaluation.py | 14 +- scripts/model_training.py | 20 +- src/datagen.py | 96 ++- src/gpuutils.py | 3 +- src/surrogate_model.py | 141 ++- 17 files changed, 1752 insertions(+), 1109 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..6634318 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,46 @@ +# Run various link checks, like flake8 and black, to make sure +# our code remains in good shape, avoids common bugs, and follows +# common coding conventions. +name: lint + +on: + push: + branches-ignore: + - main + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + linting: + runs-on: ubuntu-latest + steps: + #---------------------------------------------- + # check-out repo and set-up python + #---------------------------------------------- + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.12 + #---------------------------------------------- + # load pip cache if cache exists + #---------------------------------------------- + - uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip + restore-keys: ${{ runner.os }}-pip + #---------------------------------------------- + # run isort + #---------------------------------------------- + - run: python -m pip install isort + - run: | + isort --profile black ./src ./tests + #---------------------------------------------- + # run black + #---------------------------------------------- + - run: python -m pip install black[jupyter] + - run: | + black ./src ./tests --diff + black ./src ./tests --check + diff --git a/deprecated/Yearly_model/Full_data_creation.py b/deprecated/Yearly_model/Full_data_creation.py index 05ba5b1..f08bd64 100644 --- a/deprecated/Yearly_model/Full_data_creation.py +++ b/deprecated/Yearly_model/Full_data_creation.py @@ -1,18 +1,18 @@ # Databricks notebook source -# MAGIC %md # Creating the Full Dataset used for modeling. +# MAGIC %md # Creating the Full Dataset used for modeling. # MAGIC # MAGIC ### Goal -# MAGIC Join the ResStock outputs to weather and metadata. And apply the necessary aggregation to energy consumption. +# MAGIC Join the ResStock outputs to weather and metadata. And apply the necessary aggregation to energy consumption. # MAGIC # MAGIC ### Process # MAGIC Please create the weather_data tables for the required aggregation, and the metadata with upgrades table, if you have not already done so, before running this notebook # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC - `building_model.resstock_outputs_hourly`: ResStock Hourly output table # MAGIC - `building_model.weather_data_yearly`: Contains yearly weather data. 
Can also use monthly or daily we just need to change the suffix # MAGIC - `building_model.metadata_w_upgrades`: Contains metadata with upgrades. # MAGIC -# MAGIC ##### Outputs: +# MAGIC ##### Outputs: # MAGIC - `building_model.resstock_yearly_with_metadata_weather_upgrades`: Contains the final table used for modeling. Can also have a daily or monthly version as well # MAGIC # MAGIC @@ -23,14 +23,15 @@ import pyspark.sql.functions as F from pyspark.sql.functions import col from pyspark.sql.functions import avg + spark.conf.set("spark.sql.shuffle.partitions", 1536) # COMMAND ---------- -resstock_path = 'building_model.resstock_outputs_hourly' -weather_data_full_path = 'building_model.weather_data_yearly' -metadata_path = 'building_model.metadata_w_upgrades' +resstock_path = "building_model.resstock_outputs_hourly" +weather_data_full_path = "building_model.weather_data_yearly" +metadata_path = "building_model.metadata_w_upgrades" resstock = spark.table(resstock_path) metadata = spark.table(metadata_path) @@ -40,75 +41,161 @@ ##define end uses by fuel type. And select the columns corresponding to them -heating_electric = ['out_electricity_heating_fans_pumps_energy_consumption_kwh', 'out_electricity_heating_hp_bkup_energy_consumption_kwh', 'out_electricity_heating_energy_consumption_kwh'] +heating_electric = [ + "out_electricity_heating_fans_pumps_energy_consumption_kwh", + "out_electricity_heating_hp_bkup_energy_consumption_kwh", + "out_electricity_heating_energy_consumption_kwh", +] -cooling_electric = ['out_electricity_cooling_fans_pumps_energy_consumption_kwh', - 'out_electricity_cooling_energy_consumption_kwh'] +cooling_electric = [ + "out_electricity_cooling_fans_pumps_energy_consumption_kwh", + "out_electricity_cooling_energy_consumption_kwh", +] -heating_nat_gas = ['out_natural_gas_heating_hp_bkup_energy_consumption_kwh','out_natural_gas_heating_energy_consumption_kwh'] +heating_nat_gas = [ + "out_natural_gas_heating_hp_bkup_energy_consumption_kwh", + "out_natural_gas_heating_energy_consumption_kwh", +] -heating_fuel_oil =['out_fuel_oil_heating_hp_bkup_energy_consumption_kwh','out_fuel_oil_heating_energy_consumption_kwh'] +heating_fuel_oil = [ + "out_fuel_oil_heating_hp_bkup_energy_consumption_kwh", + "out_fuel_oil_heating_energy_consumption_kwh", +] -heating_propane = ['out_propane_heating_hp_bkup_energy_consumption_kwh', - 'out_propane_heating_energy_consumption_kwh'] +heating_propane = [ + "out_propane_heating_hp_bkup_energy_consumption_kwh", + "out_propane_heating_energy_consumption_kwh", +] # COMMAND ---------- - -resstock = (resstock.withColumn( - 'out_electricity_heating_total', sum(resstock[col] for col in heating_electric)).withColumn( - 'out_electricity_cooling_total', sum(resstock[col] for col in cooling_electric)).withColumn( - 'out_natural_gas_heating_total', sum(resstock[col] for col in heating_nat_gas)).withColumn( - 'out_fuel_oil_heating_total', sum(resstock[col] for col in heating_fuel_oil)).withColumn('out_propane_heating_total', sum(resstock[col] for col in heating_propane)) - ) -drop_list = heating_electric + cooling_electric + heating_fuel_oil + heating_nat_gas + heating_propane +resstock = ( + resstock.withColumn( + "out_electricity_heating_total", sum(resstock[col] for col in heating_electric) + ) + .withColumn( + "out_electricity_cooling_total", sum(resstock[col] for col in cooling_electric) + ) + .withColumn( + "out_natural_gas_heating_total", sum(resstock[col] for col in heating_nat_gas) + ) + .withColumn( + "out_fuel_oil_heating_total", sum(resstock[col] for 
col in heating_fuel_oil) + ) + .withColumn( + "out_propane_heating_total", sum(resstock[col] for col in heating_propane) + ) +) + +drop_list = ( + heating_electric + + cooling_electric + + heating_fuel_oil + + heating_nat_gas + + heating_propane +) resstock = resstock.drop(*drop_list) # COMMAND ---------- from pyspark.sql.functions import sum + def Create_full_data(resstock, metadata, weather, aggregation_level, table_write_path): - if aggregation_level == 'yearly': - resstock_yearly = (resstock).groupBy('building_id','upgrade_id').agg( - *[sum(col).alias("sum_" + col) for col in resstock.columns if col not in ['building_id', 'month','upgrade_id', 'day', 'hour', 'weekday', 'timestamp']]) - - resstock_yearly_with_metadata = ( - resstock_yearly - .join(broadcast(metadata), on = ['building_id', 'upgrade_id'])) - - resstock_yearly_with_metadata_weather = ( - resstock_yearly_with_metadata - .join(broadcast(weather), on = ['county_geoid'])) + if aggregation_level == "yearly": + resstock_yearly = ( + (resstock) + .groupBy("building_id", "upgrade_id") + .agg( + *[ + sum(col).alias("sum_" + col) + for col in resstock.columns + if col + not in [ + "building_id", + "month", + "upgrade_id", + "day", + "hour", + "weekday", + "timestamp", + ] + ] + ) + ) + + resstock_yearly_with_metadata = resstock_yearly.join( + broadcast(metadata), on=["building_id", "upgrade_id"] + ) + + resstock_yearly_with_metadata_weather = resstock_yearly_with_metadata.join( + broadcast(weather), on=["county_geoid"] + ) resstock_yearly_with_metadata_weather.write.saveAsTable(table_write_path) - elif aggregation_level == 'monthly': - resstock_monthly = (resstock).groupBy('building_id', 'month', 'upgrade_id').agg( - *[sum(col).alias("sum_" + col) for col in resstock.columns if col not in ['building_id', 'month','upgrade_id', 'day', 'hour', 'weekday', 'timestamp']]) - - resstock_monthly_with_metadata = ( - resstock_monthly - .join(broadcast(metadata), on = ['building_id', 'upgrade_id'])) - - resstock_monthly_with_metadata_weather = ( - resstock_monthly_with_metadata - .join(broadcast(weather), on = ['county_geoid', 'month'])) - + elif aggregation_level == "monthly": + resstock_monthly = ( + (resstock) + .groupBy("building_id", "month", "upgrade_id") + .agg( + *[ + sum(col).alias("sum_" + col) + for col in resstock.columns + if col + not in [ + "building_id", + "month", + "upgrade_id", + "day", + "hour", + "weekday", + "timestamp", + ] + ] + ) + ) + + resstock_monthly_with_metadata = resstock_monthly.join( + broadcast(metadata), on=["building_id", "upgrade_id"] + ) + + resstock_monthly_with_metadata_weather = resstock_monthly_with_metadata.join( + broadcast(weather), on=["county_geoid", "month"] + ) + resstock_monthly_with_metadata_weather.write.saveAsTable(table_write_path) - - elif aggregation_level == 'daily': - resstock_daily = (resstock).groupBy('building_id', 'day', 'month', 'upgrade_id').agg( - *[sum(col).alias("sum_" + col) for col in resstock.columns if col not in ['building_id', 'month','upgrade_id', 'day', 'hour', 'weekday', 'timestamp']]) - - resstock_daily_with_metadata = ( - resstock_daily - .join(broadcast(metadata), on = ['building_id', 'upgrade_id'])) - - resstock_daily_with_metadata_weather = ( - resstock_daily_with_metadata - .join(broadcast(weather), on = ['county_geoid', 'day', 'month'])) + + elif aggregation_level == "daily": + resstock_daily = ( + (resstock) + .groupBy("building_id", "day", "month", "upgrade_id") + .agg( + *[ + sum(col).alias("sum_" + col) + for col in resstock.columns + if col + not in [ + 
"building_id", + "month", + "upgrade_id", + "day", + "hour", + "weekday", + "timestamp", + ] + ] + ) + ) + + resstock_daily_with_metadata = resstock_daily.join( + broadcast(metadata), on=["building_id", "upgrade_id"] + ) + + resstock_daily_with_metadata_weather = resstock_daily_with_metadata.join( + broadcast(weather), on=["county_geoid", "day", "month"] + ) resstock_daily_with_metadata_weather.write.saveAsTable(table_write_path) @@ -120,4 +207,10 @@ def Create_full_data(resstock, metadata, weather, aggregation_level, table_write table_write_path = "building_model.resstock_yearly_with_metadata_weather_upgrades" -Create_full_data(resstock = resstock, metadata = metadata, weather =weather, aggregation_level = 'yearly', table_write_path = table_write_path) +Create_full_data( + resstock=resstock, + metadata=metadata, + weather=weather, + aggregation_level="yearly", + table_write_path=table_write_path, +) diff --git a/deprecated/Yearly_model/Metadata_upgrade_creation.py b/deprecated/Yearly_model/Metadata_upgrade_creation.py index 3d990cc..a7d1fad 100644 --- a/deprecated/Yearly_model/Metadata_upgrade_creation.py +++ b/deprecated/Yearly_model/Metadata_upgrade_creation.py @@ -5,19 +5,19 @@ # MAGIC Given a metadata file, we will apply upgrades to it and append it to the original metadata. We then write this to a spark table. # MAGIC # MAGIC ### Process -# MAGIC Apply upgrades as given in the YAML file. This file does not automatically parse the YAML code, instead we have translated the logic for the first five upgrades. +# MAGIC Apply upgrades as given in the YAML file. This file does not automatically parse the YAML code, instead we have translated the logic for the first five upgrades. # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC - `building_model.resstock_metadata`: Metadata for base building models # MAGIC -# MAGIC ##### Outputs: +# MAGIC ##### Outputs: # MAGIC - `building_model.metadata_w_upgrades`: weather features on yearly aggregation. 
Can also create monthly or daily aggregations in which case we will have building_model.weather_data_daily or building_model.weather_data_monthly # MAGIC # MAGIC ### TODOs: # MAGIC # MAGIC #### Future Work # MAGIC - Add upgrades 6-10 -# MAGIC - Add in automatic parsing of YAML files +# MAGIC - Add in automatic parsing of YAML files # COMMAND ---------- @@ -31,98 +31,133 @@ from pyspark.sql.functions import avg import pandas as pd import util_datagen + spark.conf.set("spark.sql.shuffle.partitions", 1536) # COMMAND ---------- # Resstock metadata loading and preprocessing -metadata = spark.table('building_model.resstock_metadata') +metadata = spark.table("building_model.resstock_metadata") -#eligible_households = ['Single-Family Detached', 'Single-Family Attached'] -eligible_households = ['Single-Family Attached'] -metadata = metadata.filter(col("in_geometry_building_type_acs").isin(eligible_households)) +# eligible_households = ['Single-Family Detached', 'Single-Family Attached'] +eligible_households = ["Single-Family Attached"] +metadata = metadata.filter( + col("in_geometry_building_type_acs").isin(eligible_households) +) ## remove ineligible Other fuels Resstock doesn't model this -ineligible_fuels = ['Other Fuel'] -metadata = (metadata.filter(~col("in_heating_fuel").isin(ineligible_fuels))) +ineligible_fuels = ["Other Fuel"] +metadata = metadata.filter(~col("in_heating_fuel").isin(ineligible_fuels)) ## also remove shared cooling systems and shared heating systems (small number still left after previous filter) -metadata = (metadata.filter(col("in_hvac_cooling_type") != 'Shared Cooling')) -metadata = (metadata.filter(col("in_hvac_heating_efficiency") != 'Shared Heating')) - -drop_list = ['in_census_division', 'in_ahs_region', 'puma_geoid', 'in_weather_file_latitude', 'in_weather_file_longitude', 'in_sqft_bin', 'in_occupants_bin', 'in_income', 'in_geometry_floor_area_bin'] +metadata = metadata.filter(col("in_hvac_cooling_type") != "Shared Cooling") +metadata = metadata.filter(col("in_hvac_heating_efficiency") != "Shared Heating") + +drop_list = [ + "in_census_division", + "in_ahs_region", + "puma_geoid", + "in_weather_file_latitude", + "in_weather_file_longitude", + "in_sqft_bin", + "in_occupants_bin", + "in_income", + "in_geometry_floor_area_bin", +] metadata = metadata.drop(*drop_list) -#convert to pandas dataframe +# convert to pandas dataframe metadata = metadata.toPandas() # COMMAND ---------- ## metadata feature creation -metadata['upgrade_id'] = 0 -metadata['in_hvac_backup_heating_efficiency_nominal_percent'] = 'None' -metadata['in_backup_heating_fuel'] = 'None' +metadata["upgrade_id"] = 0 +metadata["in_hvac_backup_heating_efficiency_nominal_percent"] = "None" +metadata["in_backup_heating_fuel"] = "None" -met_conditions = metadata["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) -metadata.loc[met_conditions, 'in_hvac_backup_heating_efficiency_nominal_percent'] = '100%' -metadata.loc[met_conditions, 'in_backup_heating_fuel'] = 'Electricity' +met_conditions = metadata["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) +metadata.loc[met_conditions, "in_hvac_backup_heating_efficiency_nominal_percent"] = ( + "100%" +) +metadata.loc[met_conditions, "in_backup_heating_fuel"] = "Electricity" # COMMAND ---------- - metadata_upgrade1 = metadata.copy() + + def attic_insulation_IECC_CZ1A(df): - met_conditions = (df["in_ashrae_iecc_climate_zone_2004"] == "1A") & \ - (df["in_geometry_attic_type"] == "Vented Attic") & \ - (df["in_insulation_ceiling"].isin(["Uninsulated", "R-7", 
"R-13"])) - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_ceiling"] = "R-30" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + (df["in_ashrae_iecc_climate_zone_2004"] == "1A") + & (df["in_geometry_attic_type"] == "Vented Attic") + & (df["in_insulation_ceiling"].isin(["Uninsulated", "R-7", "R-13"])) + ) + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_ceiling"] = "R-30" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def attic_insulation_IECC_CZ2A_2B_3A_3B_3C(df): - met_conditions = (df["in_ashrae_iecc_climate_zone_2004"].isin(["2A", "2B", "3A", "3B", "3C"])) & \ - (df["in_geometry_attic_type"] == "Vented Attic") & \ - (df["in_insulation_ceiling"].isin(["Uninsulated", "R-7", "R-13", "R-19", "R-30"])) - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_ceiling"] = "R-49" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + (df["in_ashrae_iecc_climate_zone_2004"].isin(["2A", "2B", "3A", "3B", "3C"])) + & (df["in_geometry_attic_type"] == "Vented Attic") + & ( + df["in_insulation_ceiling"].isin( + ["Uninsulated", "R-7", "R-13", "R-19", "R-30"] + ) + ) + ) + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_ceiling"] = "R-49" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def attic_insulation_IECC_CZ4A_7C(df): - met_conditions = (df["in_ashrae_iecc_climate_zone_2004"].isin(["4A", "4B", "4C", "5A", "5B", "6A", "6B", "7A", "7B"])) & \ - (df["in_geometry_attic_type"] == "Vented Attic") & \ - (df["in_insulation_ceiling"].isin(["Uninsulated", "R-7", "R-13", "R-19", "R-30", "R-38"])) - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_ceiling"] = "R-60" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + ( + df["in_ashrae_iecc_climate_zone_2004"].isin( + ["4A", "4B", "4C", "5A", "5B", "6A", "6B", "7A", "7B"] + ) + ) + & (df["in_geometry_attic_type"] == "Vented Attic") + & ( + df["in_insulation_ceiling"].isin( + ["Uninsulated", "R-7", "R-13", "R-19", "R-30", "R-38"] + ) + ) + ) + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_ceiling"] = "R-60" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def infiltration_30pct_reduction(df): - met_conditions = df["in_infiltration_ach50"] >= 15 - if met_conditions.any(): - df.loc[met_conditions, "in_infiltration_ach50"] *= 0.7 - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = df["in_infiltration_ach50"] >= 15 + if met_conditions.any(): + df.loc[met_conditions, "in_infiltration_ach50"] *= 0.7 + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def ducts_leakage(df): - met_conditions = df["in_ducts_leakage"] > 0 - if met_conditions.any(): - df.loc[met_conditions, "in_ducts_leakage"] = 10 - df.loc[met_conditions, "in_ducts_insulation"] = 'R-8' - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = df["in_ducts_leakage"] > 0 + if met_conditions.any(): + df.loc[met_conditions, "in_ducts_leakage"] = 10 + df.loc[met_conditions, "in_ducts_insulation"] = "R-8" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def Drill_and_fill(df): - met_conditions = df["in_insulation_wall"] == "Wood Stud, Uninsulated" - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_wall"] = "Wood Stud, R-13" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = df["in_insulation_wall"] == "Wood Stud, Uninsulated" + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_wall"] = "Wood Stud, R-13" + df.loc[met_conditions, 
"eligible_for_upgrade"] = 1 + def apply_upgrade_01(df): - df['upgrade_id'] = 1 + df["upgrade_id"] = 1 df["eligible_for_upgrade"] = 0 attic_insulation_IECC_CZ1A(df) attic_insulation_IECC_CZ2A_2B_3A_3B_3C(df) @@ -137,108 +172,152 @@ def apply_upgrade_01(df): def apply_upgrade_foundation_wall_insulation(df): - met_conditions = (df["in_geometry_foundation_type"].isin(["Unvented Crawlspace", "Vented Crawlspace"])) & \ - (df["in_insulation_foundation_wall"] == "Uninsulated") - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_foundation_wall"] = "Wall R-10, Interior" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + df["in_geometry_foundation_type"].isin( + ["Unvented Crawlspace", "Vented Crawlspace"] + ) + ) & (df["in_insulation_foundation_wall"] == "Uninsulated") + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_foundation_wall"] = "Wall R-10, Interior" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def apply_upgrade_foundation_wall_insulation_finished_basement(df): - met_conditions = (df["in_geometry_foundation_type"] == "Heated Basement") & \ - (df["in_insulation_foundation_wall"] == "Uninsulated") - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_foundation_wall"] = "Wall R-10, Interior" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = (df["in_geometry_foundation_type"] == "Heated Basement") & ( + df["in_insulation_foundation_wall"] == "Uninsulated" + ) + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_foundation_wall"] = "Wall R-10, Interior" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def apply_upgrade_rim_joist_insulation(df): - met_conditions = (df["in_geometry_foundation_type"].isin(["Unvented Crawlspace", "Vented Crawlspace", "Heated Basement"])) & \ - (df["in_insulation_foundation_wall"] == "Uninsulated") - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_rim_joist"] = "R-10, Exterior" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + df["in_geometry_foundation_type"].isin( + ["Unvented Crawlspace", "Vented Crawlspace", "Heated Basement"] + ) + ) & (df["in_insulation_foundation_wall"] == "Uninsulated") + if met_conditions.any(): + df.loc[met_conditions, "in_insulation_rim_joist"] = "R-10, Exterior" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def apply_upgrade_seal_vented_crawlspaces(df): - met_conditions = (df["in_geometry_foundation_type"] == "Unvented Crawlspace") - if met_conditions.any(): - df.loc[met_conditions, "in_geometry_foundation_type"] = "Unvented Crawlspace" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = df["in_geometry_foundation_type"] == "Unvented Crawlspace" + if met_conditions.any(): + df.loc[met_conditions, "in_geometry_foundation_type"] = "Unvented Crawlspace" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def apply_upgrade_roof_insulation_to_R30(df): - met_conditions = (df["in_geometry_attic_type"] == "Finished Attic or Cathedral Ceilings") & \ - (df["in_insulation_roof"].isin(["Finished, Uninsulated", "Finished, R-7", "Finished, R-13"])) - if met_conditions.any(): - df.loc[met_conditions, "in_insulation_roof"] = "Finished, R-30" - df.loc[met_conditions, "eligible_for_upgrade"] = 1 + met_conditions = ( + df["in_geometry_attic_type"] == "Finished Attic or Cathedral Ceilings" + ) & ( + df["in_insulation_roof"].isin( + ["Finished, Uninsulated", "Finished, R-7", "Finished, R-13"] + ) + ) + if met_conditions.any(): + df.loc[met_conditions, 
"in_insulation_roof"] = "Finished, R-30" + df.loc[met_conditions, "eligible_for_upgrade"] = 1 def apply_upgrade_02(df): - df['upgrade_id'] = 2 - df["eligible_for_upgrade"] = 0 - attic_insulation_IECC_CZ1A(df) - attic_insulation_IECC_CZ2A_2B_3A_3B_3C(df) - attic_insulation_IECC_CZ4A_7C(df) - infiltration_30pct_reduction(df) - ducts_leakage(df) - Drill_and_fill(df) - apply_upgrade_foundation_wall_insulation(df) - apply_upgrade_foundation_wall_insulation_finished_basement(df) - apply_upgrade_rim_joist_insulation(df) - apply_upgrade_seal_vented_crawlspaces(df) - apply_upgrade_roof_insulation_to_R30(df) - return df + df["upgrade_id"] = 2 + df["eligible_for_upgrade"] = 0 + attic_insulation_IECC_CZ1A(df) + attic_insulation_IECC_CZ2A_2B_3A_3B_3C(df) + attic_insulation_IECC_CZ4A_7C(df) + infiltration_30pct_reduction(df) + ducts_leakage(df) + Drill_and_fill(df) + apply_upgrade_foundation_wall_insulation(df) + apply_upgrade_foundation_wall_insulation_finished_basement(df) + apply_upgrade_rim_joist_insulation(df) + apply_upgrade_seal_vented_crawlspaces(df) + apply_upgrade_roof_insulation_to_R30(df) + return df # COMMAND ---------- def apply_upgrade_03(df): - df["eligible_for_upgrade"] = 0 - df['upgrade_id'] = 3 - apply_logic_asHP = ((df["in_hvac_has_ducts"] == True) & - (~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) | - df["in_hvac_heating_efficiency"].isin(["ASHP, SEER 10, 6.2 HSPF", - "ASHP, SEER 13, 7.7 HSPF", - "ASHP, SEER 15, 8.5 HSPF"]))) - - apply_logic_msHP = ((df["in_hvac_has_ducts"] == False) & - (~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) | - df["in_hvac_heating_efficiency"].isin(["MSHP, SEER 14.5, 8.2 HSPF"]))) - - df.loc[apply_logic_asHP, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" - df.loc[apply_logic_msHP, "in_hvac_heating_efficiency"] = "MSHP, SEER 15, 9.0 HSPF, Max Load" - - df.loc[apply_logic_asHP | apply_logic_msHP, "eligible_for_upgrade"] = 1 - df.loc[apply_logic_asHP | apply_logic_msHP, "in_hvac_cooling_type"] = 'Heat Pump' - df.loc[apply_logic_asHP | apply_logic_msHP, "in_hvac_cooling_partial_space_conditioning"] = '100%' - df.loc[apply_logic_asHP | apply_logic_msHP, "in_heating_fuel"] = "Electricity" - df.loc[apply_logic_asHP | apply_logic_msHP, "in_hvac_backup_heating_efficiency_nominal_percent"] = "100%" - return df + df["eligible_for_upgrade"] = 0 + df["upgrade_id"] = 3 + apply_logic_asHP = (df["in_hvac_has_ducts"] == True) & ( + ~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) + | df["in_hvac_heating_efficiency"].isin( + [ + "ASHP, SEER 10, 6.2 HSPF", + "ASHP, SEER 13, 7.7 HSPF", + "ASHP, SEER 15, 8.5 HSPF", + ] + ) + ) + + apply_logic_msHP = (df["in_hvac_has_ducts"] == False) & ( + ~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) + | df["in_hvac_heating_efficiency"].isin(["MSHP, SEER 14.5, 8.2 HSPF"]) + ) + + df.loc[apply_logic_asHP, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" + df.loc[apply_logic_msHP, "in_hvac_heating_efficiency"] = ( + "MSHP, SEER 15, 9.0 HSPF, Max Load" + ) + + df.loc[apply_logic_asHP | apply_logic_msHP, "eligible_for_upgrade"] = 1 + df.loc[apply_logic_asHP | apply_logic_msHP, "in_hvac_cooling_type"] = "Heat Pump" + df.loc[ + apply_logic_asHP | apply_logic_msHP, + "in_hvac_cooling_partial_space_conditioning", + ] = "100%" + df.loc[apply_logic_asHP | apply_logic_msHP, "in_heating_fuel"] = "Electricity" + df.loc[ + apply_logic_asHP | apply_logic_msHP, + "in_hvac_backup_heating_efficiency_nominal_percent", + ] = "100%" + return df def apply_upgrade_04(df): - 
df['upgrade_id'] = 4 - df["eligible_for_upgrade"] = 0 - apply_logic_ducted_msHP = (df["in_hvac_has_ducts"] == True) - - apply_logic_nonducted_msHP = ((df["in_hvac_has_ducts"] == False) & - (~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) | - df["in_hvac_heating_efficiency"].isin(["MSHP, SEER 14.5, 8.2 HSPF", - "MSHP, SEER 29.3, 14 HSPF, Max Load"]))) - - df.loc[apply_logic_ducted_msHP, "in_hvac_heating_efficiency"] = "MSHP, SEER 24, 13 HSPF" - - df.loc[apply_logic_nonducted_msHP, "in_hvac_heating_efficiency"] = "MSHP, SEER 29.3, 14 HSPF, Max Load" - - df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "eligible_for_upgrade"] = 1 - df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_hvac_cooling_type"] = 'Heat Pump' - df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_hvac_cooling_partial_space_conditioning"] = '100%' - df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_heating_fuel"] = "Electricity" - df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_hvac_backup_heating_efficiency_nominal_percent"] = "100%" - return df + df["upgrade_id"] = 4 + df["eligible_for_upgrade"] = 0 + apply_logic_ducted_msHP = df["in_hvac_has_ducts"] == True + + apply_logic_nonducted_msHP = (df["in_hvac_has_ducts"] == False) & ( + ~df["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) + | df["in_hvac_heating_efficiency"].isin( + ["MSHP, SEER 14.5, 8.2 HSPF", "MSHP, SEER 29.3, 14 HSPF, Max Load"] + ) + ) + + df.loc[apply_logic_ducted_msHP, "in_hvac_heating_efficiency"] = ( + "MSHP, SEER 24, 13 HSPF" + ) + + df.loc[apply_logic_nonducted_msHP, "in_hvac_heating_efficiency"] = ( + "MSHP, SEER 29.3, 14 HSPF, Max Load" + ) + + df.loc[ + apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "eligible_for_upgrade" + ] = 1 + df.loc[ + apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_hvac_cooling_type" + ] = "Heat Pump" + df.loc[ + apply_logic_ducted_msHP | apply_logic_nonducted_msHP, + "in_hvac_cooling_partial_space_conditioning", + ] = "100%" + df.loc[apply_logic_ducted_msHP | apply_logic_nonducted_msHP, "in_heating_fuel"] = ( + "Electricity" + ) + df.loc[ + apply_logic_ducted_msHP | apply_logic_nonducted_msHP, + "in_hvac_backup_heating_efficiency_nominal_percent", + ] = "100%" + return df # COMMAND ---------- @@ -247,46 +326,53 @@ def apply_upgrade_04(df): def apply_upgrade_05(df): - df['upgrade_id'] = 5 - apply_logic_asHP = (df["in_hvac_cooling_type"] == 'Heat Pump') - - apply_logic_else = (df["in_hvac_cooling_type"] != 'Heat Pump') - - df.loc[apply_logic_asHP, "in_hvac_backup_heating_efficiency_nominal_percent"] = "100%" - df.loc[apply_logic_asHP, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" - - - df.loc[apply_logic_else, "in_hvac_backup_heating_efficiency_nominal_percent"] = df.loc[apply_logic_else, "in_hvac_heating_efficiency_nominal_percent"] - df.loc[apply_logic_else, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" - - df["eligible_for_upgrade"] = 1 - df[ "in_hvac_cooling_type"] = 'Heat Pump' - df["in_hvac_cooling_partial_space_conditioning"] = '100%' - df['in_heating_fuel'] = "Electricity" - df['in_backup_heating_fuel'] = df['in_heating_fuel'] - return df - + df["upgrade_id"] = 5 + apply_logic_asHP = df["in_hvac_cooling_type"] == "Heat Pump" + + apply_logic_else = df["in_hvac_cooling_type"] != "Heat Pump" + + df.loc[apply_logic_asHP, "in_hvac_backup_heating_efficiency_nominal_percent"] = ( + "100%" + ) + df.loc[apply_logic_asHP, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" + + 
df.loc[apply_logic_else, "in_hvac_backup_heating_efficiency_nominal_percent"] = ( + df.loc[apply_logic_else, "in_hvac_heating_efficiency_nominal_percent"] + ) + df.loc[apply_logic_else, "in_hvac_heating_efficiency"] = "ASHP, SEER 15, 9.0 HSPF" + + df["eligible_for_upgrade"] = 1 + df["in_hvac_cooling_type"] = "Heat Pump" + df["in_hvac_cooling_partial_space_conditioning"] = "100%" + df["in_heating_fuel"] = "Electricity" + df["in_backup_heating_fuel"] = df["in_heating_fuel"] + return df # COMMAND ---------- + def apply_all_upgrades(df): - ''' Creates a new dataframe with all of the upgrades attached + """Creates a new dataframe with all of the upgrades attached - Currently applies all 5 of the upgrades available. + Currently applies all 5 of the upgrades available. - Args: - df: A pandas dataframe containing the metadata for the base buildings + Args: + df: A pandas dataframe containing the metadata for the base buildings - Returns: - A pandas dataframe containing metadata for the base building and all of the upgraded buildings. - ''' - return pd.concat([df, - apply_upgrade_01(df.copy()), - apply_upgrade_02(df.copy()), - apply_upgrade_03(df.copy()), - apply_upgrade_04(df.copy()), - apply_upgrade_05(df.copy())]) + Returns: + A pandas dataframe containing metadata for the base building and all of the upgraded buildings. + """ + return pd.concat( + [ + df, + apply_upgrade_01(df.copy()), + apply_upgrade_02(df.copy()), + apply_upgrade_03(df.copy()), + apply_upgrade_04(df.copy()), + apply_upgrade_05(df.copy()), + ] + ) # COMMAND ---------- @@ -297,43 +383,63 @@ def apply_all_upgrades(df): ## preprocessing of features -metadata_w_upgrades['in_vintage'] = metadata_w_upgrades['in_vintage'].apply(util_datagen.vintage2age2010) +metadata_w_upgrades["in_vintage"] = metadata_w_upgrades["in_vintage"].apply( + util_datagen.vintage2age2010 +) -metadata_w_upgrades['in_ducts_leakage'] = metadata_w_upgrades['in_ducts_leakage'].fillna(0) +metadata_w_upgrades["in_ducts_leakage"] = metadata_w_upgrades[ + "in_ducts_leakage" +].fillna(0) -metadata_w_upgrades['in_geometry_stories'] = metadata_w_upgrades['in_geometry_stories'].astype(float) +metadata_w_upgrades["in_geometry_stories"] = metadata_w_upgrades[ + "in_geometry_stories" +].astype(float) -metadata_w_upgrades['in_hvac_heating_efficiency_nominal_percent'] = metadata_w_upgrades['in_hvac_heating_efficiency'].apply(util_datagen.convert_heating_efficiency) +metadata_w_upgrades["in_hvac_heating_efficiency_nominal_percent"] = metadata_w_upgrades[ + "in_hvac_heating_efficiency" +].apply(util_datagen.convert_heating_efficiency) -metadata_w_upgrades['in_hvac_seer_rating'] = metadata_w_upgrades['in_hvac_heating_efficiency'].apply(util_datagen.extract_seer) +metadata_w_upgrades["in_hvac_seer_rating"] = metadata_w_upgrades[ + "in_hvac_heating_efficiency" +].apply(util_datagen.extract_seer) -metadata_w_upgrades['in_hvac_hspf_rating'] = metadata_w_upgrades['in_hvac_heating_efficiency'].apply(util_datagen.extract_hspf) +metadata_w_upgrades["in_hvac_hspf_rating"] = metadata_w_upgrades[ + "in_hvac_heating_efficiency" +].apply(util_datagen.extract_hspf) -metadata_w_upgrades['in_hvac_afue_rating'] = metadata_w_upgrades['in_hvac_heating_efficiency'].apply(util_datagen.extract_afue) +metadata_w_upgrades["in_hvac_afue_rating"] = metadata_w_upgrades[ + "in_hvac_heating_efficiency" +].apply(util_datagen.extract_afue) # COMMAND ---------- ## obtain cooling efficiency -met_conditions = metadata_w_upgrades["in_hvac_cooling_type"].str.contains("Heat Pump", na=False) 
-metadata_w_upgrades.loc[met_conditions, "in_hvac_cooling_efficiency"] = "SEER " + " " + metadata_w_upgrades.loc[met_conditions, "in_hvac_seer_rating"].astype(str) +met_conditions = metadata_w_upgrades["in_hvac_cooling_type"].str.contains( + "Heat Pump", na=False +) +metadata_w_upgrades.loc[met_conditions, "in_hvac_cooling_efficiency"] = ( + "SEER " + + " " + + metadata_w_upgrades.loc[met_conditions, "in_hvac_seer_rating"].astype(str) +) -metadata_w_upgrades["in_hvac_cooling_efficiency"] = metadata_w_upgrades["in_hvac_cooling_efficiency"].apply(util_datagen.extract_cooling_efficiency) +metadata_w_upgrades["in_hvac_cooling_efficiency"] = metadata_w_upgrades[ + "in_hvac_cooling_efficiency" +].apply(util_datagen.extract_cooling_efficiency) # COMMAND ---------- ## Convert to SparkDF and write to directory -table_name = 'metadata_w_upgrades' -database_name = 'building_model' +table_name = "metadata_w_upgrades" +database_name = "building_model" -path = database_name + '.' + table_name +path = database_name + "." + table_name metadata_w_upgrades = spark.createDataFrame(metadata_w_upgrades) -metadata_w_upgrades.write.saveAsTable(name = path, mode = 'overwrite') +metadata_w_upgrades.write.saveAsTable(name=path, mode="overwrite") # COMMAND ---------- - - diff --git a/deprecated/Yearly_model/Resstock_model.py b/deprecated/Yearly_model/Resstock_model.py index 4c873dd..4e22618 100644 --- a/deprecated/Yearly_model/Resstock_model.py +++ b/deprecated/Yearly_model/Resstock_model.py @@ -2,16 +2,16 @@ # MAGIC %md # Creating Feed Forward NN model for ResStock data # MAGIC # MAGIC ### Goal -# MAGIC Create and experiment with FF NN models for ResStock data. All preprocessing layers (normalization and one hot encoding) part of tf model so can save the model and apply directly to a test dataset. +# MAGIC Create and experiment with FF NN models for ResStock data. All preprocessing layers (normalization and one hot encoding) part of tf model so can save the model and apply directly to a test dataset. # MAGIC # MAGIC ### Process -# MAGIC Loads the full data with upgrades and weather attached. Loads all data in memory so can only do Monthly or Yearly aggregation for now. +# MAGIC Loads the full data with upgrades and weather attached. Loads all data in memory so can only do Monthly or Yearly aggregation for now. # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC - `building_model.resstock_yearly_with_metadata_weather_upgrades`: Contains the full data with upgrades and weather attached. All preprocessing has been applied already (besides normalization and one hot encoding). # MAGIC -# MAGIC ##### Outputs: -# MAGIC - `saved_model`: A saved tf object containing the model including all preprocessing layers. +# MAGIC ##### Outputs: +# MAGIC - `saved_model`: A saved tf object containing the model including all preprocessing layers. # MAGIC # MAGIC ### TODOs: # MAGIC @@ -21,7 +21,7 @@ # MAGIC #### Future Work # MAGIC - Add some analysis on future importance # MAGIC - Build model using difference from baseline as response -# MAGIC - Add model using multiple input variables. +# MAGIC - Add model using multiple input variables. 
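
The notebook header above describes folding all preprocessing (normalization and one-hot encoding) into the TensorFlow model itself, so the saved artifact can be applied directly to raw test data. A minimal sketch of that pattern is shown below; the feature names, vocabulary, and statistics are hypothetical placeholders, not the project's actual columns, and the API usage mirrors the Keras 2.x functional style used later in this file.

import numpy as np
from tensorflow.keras.layers import (CategoryEncoding, Dense, Input,
                                     Normalization, StringLookup, concatenate)
from tensorflow.keras.models import Model

# Hypothetical features: one numeric, one categorical.
num_input = Input(shape=(1,), name="in_sqft", dtype="float32")
cat_input = Input(shape=(1,), name="in_heating_fuel", dtype="string")

# Normalization with precomputed statistics (no adapt() pass needed).
norm = Normalization(axis=None, mean=1800.0, variance=250_000.0)(num_input)

# String -> integer index -> one-hot, with one extra slot for unseen (OOV) categories.
vocab = ["Electricity", "Natural Gas", "Propane"]
idx = StringLookup(vocabulary=vocab, output_mode="int",
                   oov_token="[UNK]", mask_token=None)(cat_input)
onehot = CategoryEncoding(num_tokens=len(vocab) + 1, output_mode="one_hot")(idx)

# Downstream dense layers see only the already-preprocessed tensors.
x = concatenate([norm, onehot])
x = Dense(16, activation="relu")(x)
output = Dense(2, activation="linear")(x)  # e.g. cooling and heating targets

model = Model(inputs=[num_input, cat_input], outputs=output)
model.compile(optimizer="adam", loss="mae")

# Raw (unscaled, unencoded) values go straight into the model, and the same
# holds after saving, since the lookup vocabulary and normalization statistics
# are stored as model weights (SavedModel directory, Keras 2.x style).
model.save("model_with_preprocessing")
preds = model.predict({"in_sqft": np.array([[2000.0]]),
                       "in_heating_fuel": np.array([["Natural Gas"]])})

Because the preprocessing state travels with the saved model, pointing the artifact at an unprocessed table is enough at inference time, which is the property this notebook relies on when it later calls model.save on the trained network.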
# MAGIC # MAGIC @@ -31,6 +31,7 @@ import pyspark.sql.functions as F import pandas as pd import numpy as np + # import mlflow import tensorflow as tf import itertools @@ -42,6 +43,7 @@ from tensorflow.keras.layers import Dense, BatchNormalization, InputLayer, Input from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.layers import Normalization, StringLookup, CategoryEncoding + # from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, SparkTrials from tensorflow.keras.models import Sequential, Model from tensorflow.keras.layers import Dense @@ -56,46 +58,88 @@ import os + # fix cublann OOM -os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true' +os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true" tf.config.list_physical_devices("GPU") # COMMAND ---------- -full_data_path = 'building_model.resstock_yearly_with_metadata_weather_upgrades' +full_data_path = "building_model.resstock_yearly_with_metadata_weather_upgrades" resstock_yearly_with_metadata_weather = spark.table(full_data_path) -resstock_yearly_with_metadata_weather_df = resstock_yearly_with_metadata_weather.where(F.col('upgrade_id') == 0).toPandas() +resstock_yearly_with_metadata_weather_df = resstock_yearly_with_metadata_weather.where( + F.col("upgrade_id") == 0 +).toPandas() data = resstock_yearly_with_metadata_weather_df.copy() # COMMAND ---------- -data['sum_out_heating_total'] = data.sum_out_electricity_heating_total + data.sum_out_natural_gas_heating_total + data.sum_out_fuel_oil_heating_total + data.sum_out_propane_heating_total +data["sum_out_heating_total"] = ( + data.sum_out_electricity_heating_total + + data.sum_out_natural_gas_heating_total + + data.sum_out_fuel_oil_heating_total + + data.sum_out_propane_heating_total +) # COMMAND ---------- ## let's use only one output variable for now -target_variable = ['sum_out_electricity_cooling_total', 'sum_out_heating_total'] - -additional = ['in_insulation_ceiling', 'in_insulation_floor', 'in_insulation_foundation_wall', 'in_insulation_rim_joist', 'in_insulation_roof', 'in_insulation_slab', - 'in_insulation_wall', 'in_cooling_setpoint', 'in_heating_setpoint', 'in_cooling_setpoint_has_offset', - 'in_cooling_setpoint_offset_magnitude', 'in_heating_setpoint_offset_magnitude', 'in_heating_setpoint_has_offset', - ] - -covariates = ['in_occupants', 'temp_high', 'temp_low', 'temp_avg', - 'wind_speed_avg', 'ghi_avg', 'dni_avg', 'dhi_avg', 'std_temp_high', - 'std_temp_low', 'std_wind_speed', 'std_ghi', 'in_vintage', 'in_sqft', 'in_hvac_heating_efficiency_nominal_percent', 'in_infiltration_ach50', - 'in_window_wall_ratio_mean', 'in_bedrooms', 'in_geometry_stories', 'in_ashrae_iecc_climate_zone_2004','in_income_bin_midpoint', - 'in_hvac_cooling_type', 'in_hvac_cooling_efficiency', 'in_hvac_cooling_partial_space_conditioning', 'in_is_vacant', 'in_is_rented', 'in_hvac_has_ducts', 'in_hvac_backup_heating_efficiency_nominal_percent', 'in_heating_fuel'] + additional - - +target_variable = ["sum_out_electricity_cooling_total", "sum_out_heating_total"] + +additional = [ + "in_insulation_ceiling", + "in_insulation_floor", + "in_insulation_foundation_wall", + "in_insulation_rim_joist", + "in_insulation_roof", + "in_insulation_slab", + "in_insulation_wall", + "in_cooling_setpoint", + "in_heating_setpoint", + "in_cooling_setpoint_has_offset", + "in_cooling_setpoint_offset_magnitude", + "in_heating_setpoint_offset_magnitude", + "in_heating_setpoint_has_offset", +] + +covariates = [ + "in_occupants", + "temp_high", + "temp_low", + "temp_avg", + "wind_speed_avg", + "ghi_avg", + "dni_avg", + 
"dhi_avg", + "std_temp_high", + "std_temp_low", + "std_wind_speed", + "std_ghi", + "in_vintage", + "in_sqft", + "in_hvac_heating_efficiency_nominal_percent", + "in_infiltration_ach50", + "in_window_wall_ratio_mean", + "in_bedrooms", + "in_geometry_stories", + "in_ashrae_iecc_climate_zone_2004", + "in_income_bin_midpoint", + "in_hvac_cooling_type", + "in_hvac_cooling_efficiency", + "in_hvac_cooling_partial_space_conditioning", + "in_is_vacant", + "in_is_rented", + "in_hvac_has_ducts", + "in_hvac_backup_heating_efficiency_nominal_percent", + "in_heating_fuel", +] + additional # COMMAND ---------- - # COMMAND ---------- # Assume 'data', 'covariates', and 'target_variable' are predefined @@ -106,12 +150,14 @@ X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=40) # Separate out the numeric and categorical feature names -cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist() -num_features = X_train.select_dtypes(exclude=['object', 'category', 'bool']).columns.tolist() -bool_features = X_train.select_dtypes(include=['bool']).columns.tolist() +cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist() +num_features = X_train.select_dtypes( + exclude=["object", "category", "bool"] +).columns.tolist() +bool_features = X_train.select_dtypes(include=["bool"]).columns.tolist() -# Initialize input layers and preprocessing layers for all features. +# Initialize input layers and preprocessing layers for all features. inputs = {} preprocessed = [] @@ -125,17 +171,24 @@ preprocessed = [] for feature in cat_features: # Create an input layer for the categorical feature - feature_input = Input(shape=(1,), name=feature, dtype='string') + feature_input = Input(shape=(1,), name=feature, dtype="string") inputs[feature] = feature_input - + # Create a StringLookup layer with the precomputed vocabulary # Note: Add an OOV token if your model needs to handle unseen categories - lookup_layer = StringLookup(vocabulary=vocabularies[feature], output_mode='int', mask_token=None, oov_token='[UNK]') + lookup_layer = StringLookup( + vocabulary=vocabularies[feature], + output_mode="int", + mask_token=None, + oov_token="[UNK]", + ) indexed_data = lookup_layer(feature_input) - + # Create a CategoryEncoding layer for one-hot encoding using the size of the vocabulary # Add 1 to account for the OOV token if used - one_hot_layer = CategoryEncoding(num_tokens=len(vocabularies[feature]) + 1, output_mode='one_hot') + one_hot_layer = CategoryEncoding( + num_tokens=len(vocabularies[feature]) + 1, output_mode="one_hot" + ) one_hot_data = one_hot_layer(indexed_data) preprocessed.append(one_hot_data) @@ -143,25 +196,27 @@ # Calculate mean and variance for the feature from the training set. 
This is much faster than having tf calculate it feature_mean = X_train[feature].mean() feature_variance = X_train[feature].var() - + # Create a Normalization layer for the feature - normalizer = Normalization(axis=None, mean = feature_mean, variance = feature_variance, name=f'norm_{feature}') - + normalizer = Normalization( + axis=None, mean=feature_mean, variance=feature_variance, name=f"norm_{feature}" + ) + # Directly set the weights of the Normalization layer to the precomputed statistics - # Note: Normalization expects the variance in the second position, not the standard deviation + # Note: Normalization expects the variance in the second position, not the standard deviation # Create the corresponding input layer - feature_input = Input(shape=(1,), name=feature, dtype='float32') - + feature_input = Input(shape=(1,), name=feature, dtype="float32") + # Apply the Normalization layer to the input layer normalized_feature = normalizer(feature_input) - + # Store the input and processed features inputs[feature] = feature_input preprocessed.append(normalized_feature) # Boolean features for feature in bool_features: - inputs[feature] = Input(shape=(1,), name=feature, dtype='float32') + inputs[feature] = Input(shape=(1,), name=feature, dtype="float32") preprocessed.append(inputs[feature]) # Combine preprocessed inputs @@ -172,33 +227,33 @@ # Build the rest of the neural network layers on top of the preprocessed inputs -x = Dense(256, activation='relu')(all_preprocessed_inputs) +x = Dense(256, activation="relu")(all_preprocessed_inputs) x = BatchNormalization()(x) -x = Dense(128, activation='relu')(x) +x = Dense(128, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(64, activation='relu')(x) +x = Dense(64, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(32, activation='relu')(x) +x = Dense(32, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(16, activation='relu')(x) +x = Dense(16, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(16, activation='relu')(x) +x = Dense(16, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(16, activation='relu')(x) +x = Dense(16, activation="relu")(x) x = BatchNormalization()(x) -x = Dense(16, activation='relu')(x) +x = Dense(16, activation="relu")(x) x = BatchNormalization()(x) -output = Dense(2, activation='linear')(x) +output = Dense(2, activation="linear")(x) # Create and compile the Model model = Model(inputs=list(inputs.values()), outputs=output) -model.compile(optimizer='adam', - loss="mae") +model.compile(optimizer="adam", loss="mae") + # Define the df_to_dataset function for converting DataFrames to tf.data.Dataset -# turn off shuffling. Not needed and changes the order of the data which becomes +# turn off shuffling. Not needed and changes the order of the data which becomes # a problem in our evaluation code later. 
def df_to_dataset(features, labels, shuffle=False, batch_size=128): dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels)) @@ -206,13 +261,18 @@ def df_to_dataset(features, labels, shuffle=False, batch_size=128): dataset = dataset.shuffle(buffer_size=len(features)) return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE) + # Prepare and train the model batch_size = 128 -#split train set into test and val, and use what was called the "val" set as the test set -train_size = int(X_train.shape[0]*.8) -train_ds = df_to_dataset(X_train[:train_size], y_train[:train_size], shuffle=False, batch_size=batch_size) -val_ds = df_to_dataset(X_train[train_size:], y_train[train_size:], batch_size=batch_size) +# split train set into test and val, and use what was called the "val" set as the test set +train_size = int(X_train.shape[0] * 0.8) +train_ds = df_to_dataset( + X_train[:train_size], y_train[:train_size], shuffle=False, batch_size=batch_size +) +val_ds = df_to_dataset( + X_train[train_size:], y_train[train_size:], batch_size=batch_size +) test_ds = df_to_dataset(X_val, y_val, batch_size=batch_size) @@ -231,18 +291,18 @@ def on_train_begin(self, logs=None): def on_epoch_end(self, epoch, logs=None): # Logs is a dictionary. We save the losses at the end of each epoch. - self.train_losses.append(logs.get('loss')) - self.val_losses.append(logs.get('val_loss')) # If you have validation data + self.train_losses.append(logs.get("loss")) + self.val_losses.append(logs.get("val_loss")) # If you have validation data def on_train_end(self, logs=None): # This function is called at the end of training. # Plot the losses. plt.figure() - plt.plot(self.train_losses, label='Training loss') - plt.plot(self.val_losses, label='Validation loss') - plt.title('Training and Validation Loss') - plt.xlabel('Epochs') - plt.ylabel('Loss') + plt.plot(self.train_losses, label="Training loss") + plt.plot(self.val_losses, label="Validation loss") + plt.title("Training and Validation Loss") + plt.xlabel("Epochs") + plt.ylabel("Loss") plt.legend() plt.show() @@ -259,117 +319,158 @@ def on_train_end(self, logs=None): h = model.fit( train_ds, epochs=100, - verbose = 2, + verbose=2, validation_data=val_ds, callbacks=[early_stopping, history], ) # COMMAND ---------- -predictions = np.clip(model.predict(test_ds), a_min = 0, a_max = None) +predictions = np.clip(model.predict(test_ds), a_min=0, a_max=None) # get correct loss on training data. 
mae = model.evaluate(test_ds) # COMMAND ---------- -# Create table for comparsion with other methods + +# Create table for comparsion with other methods @udf(returnType=FloatType()) -def get_percent_error(pred:float, true:float) -> float: +def get_percent_error(pred: float, true: float) -> float: if true == 0: return None - return abs(pred - true)/true + return abs(pred - true) / true + df_true = pd.DataFrame(y_val).reset_index() -df_true.columns = ['building_id', 'cooling', 'heating'] -df_true['hvac'] = df_true.cooling + df_true.heating +df_true.columns = ["building_id", "cooling", "heating"] +df_true["hvac"] = df_true.cooling + df_true.heating df_pred = pd.DataFrame(predictions) -df_pred.columns = ['cooling', 'heating'] -df_pred['building_id'] = y_val.index.values -df_pred['hvac'] = df_pred.cooling + df_pred.heating - -df_y = spark.createDataFrame(df_true.melt(id_vars = 'building_id').merge(df_pred.melt(id_vars = 'building_id'), on = ['building_id', 'variable'], suffixes = ['_true', '_pred'])) +df_pred.columns = ["cooling", "heating"] +df_pred["building_id"] = y_val.index.values +df_pred["hvac"] = df_pred.cooling + df_pred.heating + +df_y = spark.createDataFrame( + df_true.melt(id_vars="building_id").merge( + df_pred.melt(id_vars="building_id"), + on=["building_id", "variable"], + suffixes=["_true", "_pred"], + ) +) -df_metadata = spark.createDataFrame(X_val_pd.reset_index().rename(columns= {'index' : 'building_id'})[['building_id', 'in_heating_fuel', 'in_hvac_cooling_type']]) +df_metadata = spark.createDataFrame( + X_val_pd.reset_index().rename(columns={"index": "building_id"})[ + ["building_id", "in_heating_fuel", "in_hvac_cooling_type"] + ] +) df_eval = ( - df_metadata - .join(df_y.withColumnRenamed('variable', 'end_use'), on = ['building_id']) - .replace({'AC' : 'Central AC'}, subset = 'in_hvac_cooling_type') - .withColumn('in_heating_fuel', - F.when(F.col('in_hvac_cooling_type') == 'Heat Pump', F.lit('Heat Pump')) - .otherwise(F.col('in_heating_fuel'))) - .withColumn('type', - F.when(F.col('end_use') == 'cooling', F.col('in_hvac_cooling_type')) - .when(F.col('end_use') == 'heating', F.col('in_heating_fuel')) - .otherwise(F.lit('Total'))) - .withColumn('abs_error', F.abs(F.col('value_pred') - F.col('value_true'))) - .withColumn('percent_error', get_percent_error(F.col('value_pred'), F.col('value_true'))) + df_metadata.join(df_y.withColumnRenamed("variable", "end_use"), on=["building_id"]) + .replace({"AC": "Central AC"}, subset="in_hvac_cooling_type") + .withColumn( + "in_heating_fuel", + F.when( + F.col("in_hvac_cooling_type") == "Heat Pump", F.lit("Heat Pump") + ).otherwise(F.col("in_heating_fuel")), + ) + .withColumn( + "type", + F.when(F.col("end_use") == "cooling", F.col("in_hvac_cooling_type")) + .when(F.col("end_use") == "heating", F.col("in_heating_fuel")) + .otherwise(F.lit("Total")), + ) + .withColumn("abs_error", F.abs(F.col("value_pred") - F.col("value_true"))) + .withColumn( + "percent_error", get_percent_error(F.col("value_pred"), F.col("value_true")) + ) ) -def get_error_metric_table(df, groupby_cols = []): - df_metrics = ( - df - .groupby(*groupby_cols) - .agg( - F.mean('abs_error').alias('Mean Abs Error'), - F.median('abs_error').alias('Median Abs Error'), - (F.median('percent_error')*100).alias('Median APE'), - (F.mean('percent_error')*100).alias('MAPE'), - ) +def get_error_metric_table(df, groupby_cols=[]): + df_metrics = df.groupby(*groupby_cols).agg( + F.mean("abs_error").alias("Mean Abs Error"), + F.median("abs_error").alias("Median Abs Error"), + 
(F.median("percent_error") * 100).alias("Median APE"), + (F.mean("percent_error") * 100).alias("MAPE"), ) return df_metrics -metrics_by_end_use_type = get_error_metric_table(df = df_eval.where(F.col('end_use') != 'hvac'), groupby_cols = ['end_use' ,'type']) -metrics_by_end_use = get_error_metric_table(df = df_eval, groupby_cols = ['end_use']) .withColumn('type', F.lit('Total')) + +metrics_by_end_use_type = get_error_metric_table( + df=df_eval.where(F.col("end_use") != "hvac"), groupby_cols=["end_use", "type"] +) +metrics_by_end_use = get_error_metric_table( + df=df_eval, groupby_cols=["end_use"] +).withColumn("type", F.lit("Total")) df_metrics_combined = metrics_by_end_use_type.unionByName(metrics_by_end_use).toPandas() -df_metrics_combined.to_csv('gs://the-cube/export/surrogate_model_metrics/feed_forward.csv', index=False) +df_metrics_combined.to_csv( + "gs://the-cube/export/surrogate_model_metrics/feed_forward.csv", index=False +) # COMMAND ---------- -def get_results(data_sub, predictions, groupby_cols = ['upgrade_id']): - X_train_df, X_test_df, y_train, y_test = train_test_split(data_sub, data_sub[target_variable], test_size=0.2, random_state=40) - comparison = pd.concat([pd.DataFrame({"Predicted": predictions[:,0], "Actual": y_test.iloc[:,0]}), X_test_df], axis = 1) - comparison['Error'] = comparison["Predicted"] - comparison["Actual"] - comparison['Abs Error'] = np.abs(comparison['Error']) - comparison['APE'] = (comparison['Abs Error']/comparison['Actual']).replace([np.inf, -np.inf], np.nan) +def get_results(data_sub, predictions, groupby_cols=["upgrade_id"]): + X_train_df, X_test_df, y_train, y_test = train_test_split( + data_sub, data_sub[target_variable], test_size=0.2, random_state=40 + ) + comparison = pd.concat( + [ + pd.DataFrame({"Predicted": predictions[:, 0], "Actual": y_test.iloc[:, 0]}), + X_test_df, + ], + axis=1, + ) + + comparison["Error"] = comparison["Predicted"] - comparison["Actual"] + comparison["Abs Error"] = np.abs(comparison["Error"]) + comparison["APE"] = (comparison["Abs Error"] / comparison["Actual"]).replace( + [np.inf, -np.inf], np.nan + ) + + comparison_agg = comparison.groupby(groupby_cols).agg( + { + "Error": ["mean", "median"], + "Abs Error": ["mean", "median", "sum"], + "APE": ["mean", "median"], + "Actual": ["sum"], + } + ) - comparison_agg = comparison.groupby(groupby_cols).agg({ - "Error" : ["mean", 'median'], - "Abs Error" : ["mean", 'median', 'sum'], - "APE" : ['mean', 'median'], - "Actual" : ['sum'], - }) + # comparison_agg['WAPE'] = comparison_agg['Abs Error','sum']/comparison_agg['Actual', 'sum'] + return comparison_agg.drop([("Abs Error", "sum"), "Actual"], axis=1) - #comparison_agg['WAPE'] = comparison_agg['Abs Error','sum']/comparison_agg['Actual', 'sum'] - return comparison_agg.drop([('Abs Error','sum'), 'Actual'],axis=1) # COMMAND ---------- ## view error by grouping variable y = y_val comparison = pd.DataFrame({"Predicted": np.hstack(predictions), "Actual": y_val}) -comparison['abs_error'] = np.abs(comparison["Predicted"] - comparison["Actual"]) -comparison['error'] = comparison["Predicted"] - comparison["Actual"] +comparison["abs_error"] = np.abs(comparison["Predicted"] - comparison["Actual"]) +comparison["error"] = comparison["Predicted"] - comparison["Actual"] actuals_and_preds = pd.concat([X_val, comparison], axis=1) comparison.index = X_val.index ## Group by any characteristic and view the error -grouping_variable = ['in_hvac_cooling_type'] +grouping_variable = ["in_hvac_cooling_type"] average_error = 
actuals_and_preds.groupby(grouping_variable)["error"].mean() average_value = actuals_and_preds.groupby(grouping_variable)["Actual"].mean() average_abs_error = actuals_and_preds.groupby(grouping_variable)["abs_error"].mean() -average_prediction= actuals_and_preds.groupby(grouping_variable)["Predicted"].mean() +average_prediction = actuals_and_preds.groupby(grouping_variable)["Predicted"].mean() -WMAPE = average_abs_error/average_value -WMPE = average_error/average_value +WMAPE = average_abs_error / average_value +WMPE = average_error / average_value # Create a dictionary with arrays as values and names as keys -results = {"average_error": average_error, "average_abs_error": average_abs_error, "average_value": average_value, "average_prediction": average_prediction, - "WMAPE": WMAPE, "WMPE": WMPE} +results = { + "average_error": average_error, + "average_abs_error": average_abs_error, + "average_value": average_value, + "average_prediction": average_prediction, + "WMAPE": WMAPE, + "WMPE": WMPE, +} # Create a DataFrame from the dictionary results = pd.DataFrame(results) @@ -379,7 +480,7 @@ def get_results(data_sub, predictions, groupby_cols = ['upgrade_id']): # COMMAND ---------- # Save the entire end-to-end model, including preprocessing layers -model.save('test_model_with_preprocessing') +model.save("test_model_with_preprocessing") # COMMAND ---------- @@ -388,9 +489,9 @@ def get_results(data_sub, predictions, groupby_cols = ['upgrade_id']): # COMMAND ---------- + ## lets make a function to build a model according to certain hyperparameters def build_model(hparams): - # precompute vocabularies for all features. orders of magnitude faster than having tf determine vocabularies. vocabularies = {} for feature in cat_features: @@ -401,17 +502,24 @@ def build_model(hparams): preprocessed = [] for feature in cat_features: # Create an input layer for the categorical feature - feature_input = Input(shape=(1,), name=feature, dtype='string') + feature_input = Input(shape=(1,), name=feature, dtype="string") inputs[feature] = feature_input - + # Create a StringLookup layer with the precomputed vocabulary # Note: Add an OOV token if your model needs to handle unseen categories - lookup_layer = StringLookup(vocabulary=vocabularies[feature], output_mode='int', mask_token=None, oov_token='[UNK]') + lookup_layer = StringLookup( + vocabulary=vocabularies[feature], + output_mode="int", + mask_token=None, + oov_token="[UNK]", + ) indexed_data = lookup_layer(feature_input) - + # Create a CategoryEncoding layer for one-hot encoding using the size of the vocabulary # Add 1 to account for the OOV token if used - one_hot_layer = CategoryEncoding(num_tokens=len(vocabularies[feature]) + 1, output_mode='one_hot') + one_hot_layer = CategoryEncoding( + num_tokens=len(vocabularies[feature]) + 1, output_mode="one_hot" + ) one_hot_data = one_hot_layer(indexed_data) preprocessed.append(one_hot_data) @@ -419,103 +527,118 @@ def build_model(hparams): # Calculate mean and variance for the feature from the training set. 
This is much faster than having tf calculate it feature_mean = X_train[feature].mean() feature_variance = X_train[feature].var() - + # Create a Normalization layer for the feature - normalizer = Normalization(axis=None, mean = feature_mean, variance = feature_variance, name=f'norm_{feature}') - + normalizer = Normalization( + axis=None, + mean=feature_mean, + variance=feature_variance, + name=f"norm_{feature}", + ) + # Directly set the weights of the Normalization layer to the precomputed statistics - # Note: Normalization expects the variance in the second position, not the standard deviation + # Note: Normalization expects the variance in the second position, not the standard deviation # Create the corresponding input layer - feature_input = Input(shape=(1,), name=feature, dtype='float32') - + feature_input = Input(shape=(1,), name=feature, dtype="float32") + # Apply the Normalization layer to the input layer normalized_feature = normalizer(feature_input) - + # Store the input and processed features inputs[feature] = feature_input preprocessed.append(normalized_feature) # Boolean features for feature in bool_features: - inputs[feature] = Input(shape=(1,), name=feature, dtype='float32') + inputs[feature] = Input(shape=(1,), name=feature, dtype="float32") preprocessed.append(inputs[feature]) # Combine preprocessed inputs all_preprocessed_inputs = tf.keras.layers.concatenate(preprocessed) - # Build neural network layers based on hyperparameters x = all_preprocessed_inputs - for _ in range(hparams['num_layers']): - x = Dense(int(hparams['units_per_layer']), activation='relu')(x) - if hparams['batch_norm']: + for _ in range(hparams["num_layers"]): + x = Dense(int(hparams["units_per_layer"]), activation="relu")(x) + if hparams["batch_norm"]: x = BatchNormalization()(x) - - outputs = Dense(1, activation='linear')(x) # Adjust based on your specific problem - + + outputs = Dense(1, activation="linear")(x) # Adjust based on your specific problem + # Create the model model = Model(inputs=list(inputs.values()), outputs=outputs) - + # Compile the model using the learning rate from hyperparameters lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( - initial_learning_rate=hparams['learning_rate'], # Use the hyperparameter + initial_learning_rate=hparams["learning_rate"], # Use the hyperparameter decay_steps=10000, - decay_rate=hparams['learning_rate_decay'], - staircase=True) - - model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss='mae') - + decay_rate=hparams["learning_rate_decay"], + staircase=True, + ) + + model.compile( + optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule), loss="mae" + ) + return model # COMMAND ---------- -# Assuming 'data', 'covariates', and 'target_variable' are predefined. We will also be using the preprocessing and -# data prepare dataset functionality from earlier to generate our batches. +# Assuming 'data', 'covariates', and 'target_variable' are predefined. We will also be using the preprocessing and +# data prepare dataset functionality from earlier to generate our batches. 
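# The `train_ds` / `val_ds` batches consumed by `objective()` below are produced by
# the dataset-preparation helper from earlier in the notebook, which is not part of
# this hunk. A minimal sketch of what that helper is assumed to look like -- the
# name `df_to_dataset` and the batch size are illustrative, not from the source:
import tensorflow as tf


def df_to_dataset(features_df, labels, batch_size=256, shuffle=True):
    # Functional models with one Input layer per feature expect a dict of
    # column-name -> tensor, so the DataFrame is fed column-wise.
    ds = tf.data.Dataset.from_tensor_slices((dict(features_df), labels.values))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(features_df))
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# e.g. train_ds = df_to_dataset(X_train, y_train)
#      val_ds = df_to_dataset(X_val, y_val, shuffle=False)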
X = data[covariates] y = data[target_variable] # Split the original DataFrame into train and validation sets X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=40) # Separate out the numeric and categorical feature names -cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist() -num_features = X_train.select_dtypes(exclude=['object', 'category', 'bool']).columns.tolist() -bool_features = X_train.select_dtypes(include=['bool']).columns.tolist() - +cat_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist() +num_features = X_train.select_dtypes( + exclude=["object", "category", "bool"] +).columns.tolist() +bool_features = X_train.select_dtypes(include=["bool"]).columns.tolist() # Define the objective function for Hyperopt + def objective(params): model = build_model(params) - early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True) - history = model.fit(train_ds, validation_data=val_ds, epochs=3, callbacks=[early_stop], verbose=0) - best_val_loss = min(history.history['val_loss']) - return {'loss': best_val_loss, 'status': STATUS_OK} + early_stop = tf.keras.callbacks.EarlyStopping( + monitor="val_loss", patience=1, restore_best_weights=True + ) + history = model.fit( + train_ds, validation_data=val_ds, epochs=3, callbacks=[early_stop], verbose=0 + ) + best_val_loss = min(history.history["val_loss"]) + return {"loss": best_val_loss, "status": STATUS_OK} + # Define the hyperparameter space space = { - 'num_layers': hp.quniform('num_layers', 5, 8, 1), - 'units': hp.quniform('units', 32, 512, 32), - 'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1e-2)), - 'decay_rate': hp.uniform('decay_rate', 0.9, 0.99), - 'use_batch_norm': hp.choice('use_batch_norm', [False, True]) + "num_layers": hp.quniform("num_layers", 5, 8, 1), + "units": hp.quniform("units", 32, 512, 32), + "learning_rate": hp.loguniform("learning_rate", np.log(1e-4), np.log(1e-2)), + "decay_rate": hp.uniform("decay_rate", 0.9, 0.99), + "use_batch_norm": hp.choice("use_batch_norm", [False, True]), } # Run the optimization max_evals = 10 with mlflow.start_run(tags={"mlflow.runName": "Best Model Run"}): trials = SparkTrials(parallelism=2) - best_hyperparams = fmin(objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials) + best_hyperparams = fmin( + objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials + ) mlflow.log_params(best_hyperparams) - + # Rebuild and train the best model based on the best hyperparameters - best_hyperparams['num_layers'] = int(best_hyperparams['num_layers']) - best_hyperparams['units'] = int(best_hyperparams['units']) + best_hyperparams["num_layers"] = int(best_hyperparams["num_layers"]) + best_hyperparams["units"] = int(best_hyperparams["units"]) best_model = build_model(best_hyperparams) - + # Log the best model to MLflow mlflow.keras.log_model(best_model, "best_model") -print('Best hyperparameters:', best_model) - +print("Best hyperparameters:", best_model) diff --git a/deprecated/Yearly_model/Weather_gen.py b/deprecated/Yearly_model/Weather_gen.py index 62e034e..6a4c9fe 100644 --- a/deprecated/Yearly_model/Weather_gen.py +++ b/deprecated/Yearly_model/Weather_gen.py @@ -9,12 +9,12 @@ # MAGIC # MAGIC ### I/Os # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC - `building_model.weather_files_metadata`: Contains the canonical_epw_filename and county_geoid # MAGIC - `building_model.weather_files_data`: 
Contains hourly weather data. Need to join with metadata to get county_geoids # MAGIC - `neighbors.project_upfront_cost_bucketed`: upfront costs by bucket and project # MAGIC -# MAGIC ##### Outputs: +# MAGIC ##### Outputs: # MAGIC - `building_model.weather_data_yearly`: weather features on yearly aggregation. Can also create monthly or daily aggregations in which case we will have building_model.weather_data_daily or building_model.weather_data_monthly # MAGIC # MAGIC @@ -25,122 +25,160 @@ import pyspark.sql.functions as F from pyspark.sql.functions import col from pyspark.sql.functions import avg + spark.conf.set("spark.sql.shuffle.partitions", 1536) # COMMAND ---------- -#load data -weather_metadata_path = 'building_model.weather_files_metadata' -weather_data_path = 'building_model.weather_files_data' +# load data +weather_metadata_path = "building_model.weather_files_metadata" +weather_data_path = "building_model.weather_files_data" weather_metadata = spark.table(weather_metadata_path) weather_data = spark.table(weather_data_path) - # COMMAND ---------- # select relevant features -relevant_features = ['temp_air', 'relative_humidity', 'wind_speed' , 'ghi', - 'dni', 'dhi', 'canonical_epw_filename', 'year', - 'month', 'day', 'hour'] +relevant_features = [ + "temp_air", + "relative_humidity", + "wind_speed", + "ghi", + "dni", + "dhi", + "canonical_epw_filename", + "year", + "month", + "day", + "hour", +] weather_data = weather_data.select(relevant_features) # convert celcius to farenheit -weather_data = weather_data.withColumn("temp_air", (F.col("temp_air") * (9/5)) + 32) +weather_data = weather_data.withColumn("temp_air", (F.col("temp_air") * (9 / 5)) + 32) # create features for CDD, HDD, heat pump switch threshold, and freezing point. -weather_data = weather_data.withColumn( - "below_32", F.when(F.col("temp_air") < 32, 1).otherwise(0) -).withColumn( - "below_41", F.when(F.col("temp_air") < 41, 1).otherwise(0) -).withColumn( - "HDD", F.when(F.col("temp_air") > 65, F.col("temp_air") - 65).otherwise(0) -).withColumn( - "CDD", F.when(F.col("temp_air") < 65, 65 - F.col("temp_air")).otherwise(0) +weather_data = ( + weather_data.withColumn("below_32", F.when(F.col("temp_air") < 32, 1).otherwise(0)) + .withColumn("below_41", F.when(F.col("temp_air") < 41, 1).otherwise(0)) + .withColumn( + "HDD", F.when(F.col("temp_air") > 65, F.col("temp_air") - 65).otherwise(0) + ) + .withColumn( + "CDD", F.when(F.col("temp_air") < 65, 65 - F.col("temp_air")).otherwise(0) + ) ) # COMMAND ---------- -def WeatherAggregation(weather_data = weather_data , weather_metadata = weather_metadata, aggregation_level = 'yearly'): - '''Create weather data for each county geoid for the given Time aggregation. - - Creates select aggregations for the selected features over the given temporal aggregation level. Aggregations - mostly consist of averages and standard deviations, where the standard deviations. We also compute the number of HDD and CDD as well - Args: - weather_data: A spark dataframe containing weather features - weather_metadata: A spark dataframe containing information mapping weather files to county_geoid - aggregation_level: String which can take values 'yearly' 'monthly', and 'daily' +def WeatherAggregation( + weather_data=weather_data, + weather_metadata=weather_metadata, + aggregation_level="yearly", +): + """Create weather data for each county geoid for the given Time aggregation. + + Creates select aggregations for the selected features over the given temporal aggregation level. 
Aggregations + mostly consist of averages and standard deviations, where the standard deviations. We also compute the number of HDD and CDD as well + + Args: + weather_data: A spark dataframe containing weather features + weather_metadata: A spark dataframe containing information mapping weather files to county_geoid + aggregation_level: String which can take values 'yearly' 'monthly', and 'daily' - Returns: - A spark dataframe for the given temporal aggregation. - ''' + Returns: + A spark dataframe for the given temporal aggregation. + """ weather_full_daily = ( - weather_data - .join(weather_metadata.select('canonical_epw_filename', 'county_geoid'), on = 'canonical_epw_filename') - ).groupBy('day','month', 'county_geoid').agg(F.max(col("temp_air")).alias('temp_high'), F.min(col("temp_air")).alias('temp_low'), - F.avg(col("temp_air")).alias('temp_avg'), F.avg(col("wind_speed")).alias('wind_speed_avg'), - F.avg(col("ghi")).alias('ghi_avg'), - F.avg(col("dni")).alias('dni_avg'), - F.avg(col("dhi")).alias('dhi_avg'), - F.avg(col('HDD')).alias('HDD'), - F.avg(col('CDD')).alias('CDD'), - F.sum(col('below_32')).alias('below_32'), - F.sum(col('below_41')).alias('below_41'), - ) - if aggregation_level == 'daily': + ( + weather_data.join( + weather_metadata.select("canonical_epw_filename", "county_geoid"), + on="canonical_epw_filename", + ) + ) + .groupBy("day", "month", "county_geoid") + .agg( + F.max(col("temp_air")).alias("temp_high"), + F.min(col("temp_air")).alias("temp_low"), + F.avg(col("temp_air")).alias("temp_avg"), + F.avg(col("wind_speed")).alias("wind_speed_avg"), + F.avg(col("ghi")).alias("ghi_avg"), + F.avg(col("dni")).alias("dni_avg"), + F.avg(col("dhi")).alias("dhi_avg"), + F.avg(col("HDD")).alias("HDD"), + F.avg(col("CDD")).alias("CDD"), + F.sum(col("below_32")).alias("below_32"), + F.sum(col("below_41")).alias("below_41"), + ) + ) + if aggregation_level == "daily": return weather_full_daily - if aggregation_level == 'monthly': + if aggregation_level == "monthly": weather_full_monthly = ( - weather_full_daily - ).groupBy('month', 'county_geoid').agg(F.avg(col("temp_high")).alias('temp_high'), F.avg(col("temp_low")).alias('temp_low'), - F.avg(col("temp_avg")).alias('temp_avg'), F.avg(col("wind_speed_avg")).alias('wind_speed_avg'), - F.avg(col("ghi_avg")).alias('ghi_avg'), - F.avg(col("dni_avg")).alias('dni_avg'), - F.avg(col("dhi_avg")).alias('dhi_avg'), - F.stddev(col("temp_high")).alias('std_temp_high'), - F.stddev(col("temp_low")).alias('std_temp_low'), - F.stddev(col("wind_speed_avg")).alias('std_wind_speed'), - F.stddev(col("ghi_avg")).alias('std_ghi'), - F.sum(col('HDD')).alias('HDD'), - F.sum(col('CDD')).alias('CDD'), - F.sum(col('below_41')).alias('below_41'), - F.sum(col('below_32')).alias('below_32'), - ) + (weather_full_daily) + .groupBy("month", "county_geoid") + .agg( + F.avg(col("temp_high")).alias("temp_high"), + F.avg(col("temp_low")).alias("temp_low"), + F.avg(col("temp_avg")).alias("temp_avg"), + F.avg(col("wind_speed_avg")).alias("wind_speed_avg"), + F.avg(col("ghi_avg")).alias("ghi_avg"), + F.avg(col("dni_avg")).alias("dni_avg"), + F.avg(col("dhi_avg")).alias("dhi_avg"), + F.stddev(col("temp_high")).alias("std_temp_high"), + F.stddev(col("temp_low")).alias("std_temp_low"), + F.stddev(col("wind_speed_avg")).alias("std_wind_speed"), + F.stddev(col("ghi_avg")).alias("std_ghi"), + F.sum(col("HDD")).alias("HDD"), + F.sum(col("CDD")).alias("CDD"), + F.sum(col("below_41")).alias("below_41"), + F.sum(col("below_32")).alias("below_32"), + ) + ) return 
weather_full_monthly - if aggregation_level == 'yearly': + if aggregation_level == "yearly": weather_full_yearly = ( - weather_full_daily - ).groupBy('county_geoid').agg(F.avg(col("temp_high")).alias('temp_high'), F.avg(col("temp_low")).alias('temp_low'), - F.avg(col("temp_avg")).alias('temp_avg'), F.avg(col("wind_speed_avg")).alias('wind_speed_avg'), - F.avg(col("ghi_avg")).alias('ghi_avg'), - F.avg(col("dni_avg")).alias('dni_avg'), - F.avg(col("dhi_avg")).alias('dhi_avg'), - F.stddev(col("temp_high")).alias('std_temp_high'), - F.stddev(col("temp_low")).alias('std_temp_low'), - F.stddev(col("wind_speed_avg")).alias('std_wind_speed'), - F.stddev(col("ghi_avg")).alias('std_ghi'), - F.sum(col('HDD')).alias('HDD'), - F.sum(col('CDD')).alias('CDD'), - F.sum(col('below_41')).alias('below_41'), - F.sum(col('below_32')).alias('below_32'), - ) + (weather_full_daily) + .groupBy("county_geoid") + .agg( + F.avg(col("temp_high")).alias("temp_high"), + F.avg(col("temp_low")).alias("temp_low"), + F.avg(col("temp_avg")).alias("temp_avg"), + F.avg(col("wind_speed_avg")).alias("wind_speed_avg"), + F.avg(col("ghi_avg")).alias("ghi_avg"), + F.avg(col("dni_avg")).alias("dni_avg"), + F.avg(col("dhi_avg")).alias("dhi_avg"), + F.stddev(col("temp_high")).alias("std_temp_high"), + F.stddev(col("temp_low")).alias("std_temp_low"), + F.stddev(col("wind_speed_avg")).alias("std_wind_speed"), + F.stddev(col("ghi_avg")).alias("std_ghi"), + F.sum(col("HDD")).alias("HDD"), + F.sum(col("CDD")).alias("CDD"), + F.sum(col("below_41")).alias("below_41"), + F.sum(col("below_32")).alias("below_32"), + ) + ) return weather_full_yearly + # COMMAND ---------- -aggregation_level = 'yearly' +aggregation_level = "yearly" -weather_data_full = WeatherAggregation(weather_data = weather_data , weather_metadata = weather_metadata, aggregation_level = aggregation_level) +weather_data_full = WeatherAggregation( + weather_data=weather_data, + weather_metadata=weather_metadata, + aggregation_level=aggregation_level, +) -table_name = 'weather_data_' + aggregation_level -database_name = 'building_model' +table_name = "weather_data_" + aggregation_level +database_name = "building_model" -path = database_name + '.' + table_name +path = database_name + "." 
+ table_name weather_data_full.write.saveAsTable(path) # COMMAND ---------- - - diff --git a/deprecated/Yearly_model/util_datagen.py b/deprecated/Yearly_model/util_datagen.py index 6a5afd1..068d0ea 100644 --- a/deprecated/Yearly_model/util_datagen.py +++ b/deprecated/Yearly_model/util_datagen.py @@ -1,79 +1,80 @@ - import itertools import math import re from typing import Dict import re import pandas as pd + # Define BTU_PER_WH if not already defined BTU_PER_WH = 3.412 # Example value, adjust as necessary -SEER_TO_EER = .875 +SEER_TO_EER = 0.875 def convert_heating_efficiency(value): """convert HSPF to percentage or extract percentage directly""" - if 'HSPF' in value: + if "HSPF" in value: # Extract the numeric value and convert HSPF to percentage - hspf_value = float(re.search(r'(\d+\.?\d+) HSPF', value).group(1)) + hspf_value = float(re.search(r"(\d+\.?\d+) HSPF", value).group(1)) return hspf_value * 100 / BTU_PER_WH else: # Extract percentage directly if present - match = re.search(r'(\d+\.?\d+)%', value) + match = re.search(r"(\d+\.?\d+)%", value) return float(match.group(1)) if match else None + # Extract SEER, HSPF, and AFUE ratings from 'in_hvac_heating_efficiency' def extract_seer(value): - """Extract SEER ratings from 'in_hvac_heating_efficiency' values """ - parts = value.split(', ') - if len(parts) > 1 and 'SEER' in parts[1]: - seer_str = value.split(', ')[1] # Extract the SEER substring - seer_rating = seer_str.split(' ')[1] # Extract the SEER rating value + """Extract SEER ratings from 'in_hvac_heating_efficiency' values""" + parts = value.split(", ") + if len(parts) > 1 and "SEER" in parts[1]: + seer_str = value.split(", ")[1] # Extract the SEER substring + seer_rating = seer_str.split(" ")[1] # Extract the SEER rating value return float(seer_rating) return None + def extract_hspf(value): - """Extract HSPF ratings from 'in_hvac_heating_efficiency' values """ - parts = value.split(', ') - if len(parts) > 2 and 'HSPF' in parts[2]: - return float(parts[2].split(' HSPF')[0]) + """Extract HSPF ratings from 'in_hvac_heating_efficiency' values""" + parts = value.split(", ") + if len(parts) > 2 and "HSPF" in parts[2]: + return float(parts[2].split(" HSPF")[0]) return None + def extract_afue(value): - """Extract AFUE ratings from 'in_hvac_heating_efficiency' values """ - parts = value.split(', ') - if len(parts) > 1 and '%' in parts[1]: - return float(parts[1].split('%')[0]) + """Extract AFUE ratings from 'in_hvac_heating_efficiency' values""" + parts = value.split(", ") + if len(parts) > 1 and "%" in parts[1]: + return float(parts[1].split("%")[0]) return None def extract_cooling_efficiency(text): - """ Converts SEER ratings to EER values """ - if pd.isna(text): - return 99 - match = re.match(r"((?:SEER|EER))\s+([\d\.]+)", text) - if match: - efficiency_type, value = match.groups() - if efficiency_type == "SEER": - value = float(value) * SEER_TO_EER + """Converts SEER ratings to EER values""" + if pd.isna(text): + return 99 + match = re.match(r"((?:SEER|EER))\s+([\d\.]+)", text) + if match: + efficiency_type, value = match.groups() + if efficiency_type == "SEER": + value = float(value) * SEER_TO_EER + else: + value = float(value) + return value else: - value = float(value) - return value - else: - return 99 + return 99 + def vintage2age2010(vintage: str) -> int: - """ vintage of the building in the year of 2010 + """vintage of the building in the year of 2010 >>> vintage2age2000('<1940') 80 >>> vintage2age2000('1960s') 50 """ vintage = vintage.strip() - if vintage.startswith('<'): # 
'<1940' bin in resstock + if vintage.startswith("<"): # '<1940' bin in resstock return 80 else: return 2010 - int(vintage[:4]) - - - diff --git a/deprecated/infra_agnostic/datagen.py b/deprecated/infra_agnostic/datagen.py index 23af4b8..98e4632 100644 --- a/deprecated/infra_agnostic/datagen.py +++ b/deprecated/infra_agnostic/datagen.py @@ -14,10 +14,10 @@ # Constants EER_CONVERSION = { - 'EER': 1.0, - 'SEER': .875, - 'SEER2': 0.91, # ~=SEER*1.04 (https://www.marathonhvac.com/seer-to-seer2/) - 'EER2': 1.04 + "EER": 1.0, + "SEER": 0.875, + "SEER2": 0.91, # ~=SEER*1.04 (https://www.marathonhvac.com/seer-to-seer2/) + "EER2": 1.04, } BTU_PER_WH = 3.413 HOURS_IN_A_YEAR = 8760 # 24*365, assuming a non-leap year @@ -26,13 +26,13 @@ # '1': (13, 30) means units in climate zones 1A (1-anything) with R13 insulation # or less are upgraded to R30 BASIC_ENCLOSURE_INSULATION = { - '1': (13, 30), - '2': (30, 49), - '3': (30, 49), - '4': (38, 60), - '5': (38, 60), - '6': (38, 60), - '7': (38, 60), + "1": (13, 30), + "2": (30, 49), + "3": (30, 49), + "4": (38, 60), + "5": (38, 60), + "6": (38, 60), + "7": (38, 60), } # Path to ResStock dataset @@ -40,9 +40,9 @@ # to access gs:// paths without explicitly providing credentials, run # `gcloud auth application-default login` (only required once) RESSTOCK_PATH = os.environ.get( - 'SURROGATE_MODELING_RESSTOCK_PATH', - 'gs://the-cube/data/raw/nrel/end_use_load_profiles/2022/' - 'resstock_tmy3_release_1/' + "SURROGATE_MODELING_RESSTOCK_PATH", + "gs://the-cube/data/raw/nrel/end_use_load_profiles/2022/" + "resstock_tmy3_release_1/", ) # Filesystem cache layer path. @@ -50,14 +50,20 @@ # datagen, or (better) use `python-dotenv` # Cache path is only evaluated once and thus cannot be changed after import # without reloading (`importlib.reload(datagen)`) -CACHE_PATH = os.environ.get('SURROGATE_MODELING_CACHE_PATH', '.cache') +CACHE_PATH = os.environ.get("SURROGATE_MODELING_CACHE_PATH", ".cache") if not os.path.isdir(CACHE_PATH): logging.warning(f"Cache path {CACHE_PATH} does not exist. 
Attempting to create..") os.mkdir(CACHE_PATH) logging.warning("Success") -BUILDING_METADATA_PARQUET_PATH = RESSTOCK_PATH + 'metadata_and_annual_results/national/parquet/baseline_metadata_only.parquet' -HOURLY_OUTPUT_PATH = RESSTOCK_PATH + 'timeseries_individual_buildings/by_state/upgrade={upgrade_id}/state={state}/{building_id}-{upgrade_id}.parquet' +BUILDING_METADATA_PARQUET_PATH = ( + RESSTOCK_PATH + + "metadata_and_annual_results/national/parquet/baseline_metadata_only.parquet" +) +HOURLY_OUTPUT_PATH = ( + RESSTOCK_PATH + + "timeseries_individual_buildings/by_state/upgrade={upgrade_id}/state={state}/{building_id}-{upgrade_id}.parquet" +) # pattern of weather files path within RESSTOCK_PATH # examples: # `resstock_tmy3_release_1`, `resstock_tmy3_release_1.1`: @@ -66,79 +72,79 @@ # `.../weather/state={state}/{geoid}_f018.csv` # `comstock_amy2018_release_2`: # `.../weather/amy2018/{geoid}_2018.csv` -WEATHER_FILES_PATH = RESSTOCK_PATH + 'weather/state={state}/{geoid}_TMY3.csv' +WEATHER_FILES_PATH = RESSTOCK_PATH + "weather/state={state}/{geoid}_TMY3.csv" STATE_2NUM_CODE_TO_2LETTER = { # Note: keys are intentionally strings to simplify parsing county geoid - '01': 'AL', - '02': 'AK', - '04': 'AZ', - '05': 'AR', - '06': 'CA', - '08': 'CO', - '09': 'CT', - '10': 'DE', - '11': 'DC', - '12': 'FL', - '13': 'GA', - '16': 'ID', - '17': 'IL', - '18': 'IN', - '19': 'IA', - '20': 'KS', - '21': 'KY', - '22': 'LA', - '23': 'ME', - '24': 'MD', - '25': 'MA', - '26': 'MI', - '27': 'MN', - '28': 'MS', - '29': 'MO', - '30': 'MT', - '31': 'NE', - '32': 'NV', - '33': 'NH', - '34': 'NJ', - '35': 'NM', - '36': 'NY', - '37': 'NC', - '38': 'ND', - '39': 'OH', - '40': 'OK', - '41': 'OR', - '42': 'PA', - '44': 'RI', - '45': 'SC', - '46': 'SD', - '47': 'TN', - '48': 'TX', - '49': 'UT', - '50': 'VT', - '51': 'VA', - '53': 'WA', - '54': 'WV', - '55': 'WI', - '56': 'WY', + "01": "AL", + "02": "AK", + "04": "AZ", + "05": "AR", + "06": "CA", + "08": "CO", + "09": "CT", + "10": "DE", + "11": "DC", + "12": "FL", + "13": "GA", + "16": "ID", + "17": "IL", + "18": "IN", + "19": "IA", + "20": "KS", + "21": "KY", + "22": "LA", + "23": "ME", + "24": "MD", + "25": "MA", + "26": "MI", + "27": "MN", + "28": "MS", + "29": "MO", + "30": "MT", + "31": "NE", + "32": "NV", + "33": "NH", + "34": "NJ", + "35": "NM", + "36": "NY", + "37": "NC", + "38": "ND", + "39": "OH", + "40": "OK", + "41": "OR", + "42": "PA", + "44": "RI", + "45": "SC", + "46": "SD", + "47": "TN", + "48": "TX", + "49": "UT", + "50": "VT", + "51": "VA", + "53": "WA", + "54": "WV", + "55": "WI", + "56": "WY", } ORIENTATION_DEGREES = { - 'North': 0, - 'Northeast': 45, - 'East': 90, - 'Southeast': 135, - 'South': 180, - 'Southwest': 225, - 'West': 270, - 'Northwest': 315, + "North": 0, + "Northeast": 45, + "East": 90, + "Southeast": 135, + "South": 180, + "Southwest": 225, + "West": 270, + "Northwest": 315, } # https://en.wikipedia.org/wiki/Luminous_efficacy LUMINOUS_EFFICACY = { - '100% CFL': 0.12, # 8-15% - '100% Incandescent': 0.02, # 1.2-2.6% - '100% LED': 0.15 # 11-30% + "100% CFL": 0.12, # 8-15% + "100% Incandescent": 0.02, # 1.2-2.6% + "100% LED": 0.15, # 11-30% } def extract_percentage(value): - """ Extract percentage of space given + """Extract percentage of space given >>> extract_percentage('100% Conditioned') 1.0 @@ -149,31 +155,30 @@ def extract_percentage(value): >>> extract_percentage('10% Leakage, Uninsulated') 0.1 """ - if value == 'None': + if value == "None": return 0.0 - match = re.match(r'^ int: - """ vintage of the building in the year of 2000 + 
"""vintage of the building in the year of 2000 >>> vintage2age2000('<1940') 70 >>> vintage2age2000('1960s') 40 """ vintage = vintage.strip() - if vintage.startswith('<'): # '<1940' bin in resstock + if vintage.startswith("<"): # '<1940' bin in resstock return 70 return 2000 - int(vintage[:4]) def extract_r_value(construction_type: str) -> int: - """ Extract R-value from an unformatted string + """Extract R-value from an unformatted string Assumption: all baseline walls have similar R-value of ~4. The returned value is for additional insulation only. Examples: @@ -195,19 +200,19 @@ def extract_r_value(construction_type: str) -> int: 19 """ lower = construction_type.lower() - if lower == 'none' or 'uninsulated' in lower: + if lower == "none" or "uninsulated" in lower: return 0 m = re.search(r"\br-?(\d+)\b", construction_type, flags=re.I) if not m: raise ValueError( - f'Cannot determine R-value of the construction type: ' - f'{construction_type}' + f"Cannot determine R-value of the construction type: " + f"{construction_type}" ) return int(m.group(1)) def extract_cooling_efficiency(cooling_efficiency: str) -> float: - """ Convert a ResStock cooling efficiency into EER value + """Convert a ResStock cooling efficiency into EER value Cooling in ResStock building metadata comes either in `in.hvac_cooling_efficiency` (for normal ACs), or in @@ -229,13 +234,13 @@ def extract_cooling_efficiency(cooling_efficiency: str) -> float: ac_type = cooling_efficiency.split(", ", 1)[0].strip() efficiency = cooling_efficiency.rsplit(", ", 1)[-1].strip() # two special cases - if ac_type == 'None': + if ac_type == "None": # insanely high efficiency to mimic a nonexistent cooling return 999 - if ac_type == 'Heat Pump' and efficiency == ac_type: + if ac_type == "Heat Pump" and efficiency == ac_type: # a default value as we don't have anything else. # Min SEER for heat pumps is 13 by law, 13*.875 ~= 11.4 - return 13 * EER_CONVERSION['SEER'] + return 13 * EER_CONVERSION["SEER"] m = re.search(r"\b(SEER2|SEER|EER)\s+(\d+\.?\d*)", cooling_efficiency) if m: @@ -243,9 +248,7 @@ def extract_cooling_efficiency(cooling_efficiency: str) -> float: return EER_CONVERSION[m.group(1)] * float(m.group(2)) except (ValueError, KeyError): pass - raise ValueError( - f'Cannot extract cooling efficiency from: {cooling_efficiency}' - ) + raise ValueError(f"Cannot extract cooling efficiency from: {cooling_efficiency}") def extract_heating_efficiency(heating_efficiency: str) -> int: @@ -272,20 +275,20 @@ def extract_heating_efficiency(heating_efficiency: str) -> int: number = float(efficiency.strip().split(" ", 1)[0].strip("%")) except ValueError: raise ValueError( - f'Cannot extract heating efficiency from: {heating_efficiency}' + f"Cannot extract heating efficiency from: {heating_efficiency}" ) if efficiency.endswith("AFUE"): return int(number) if efficiency.endswith("HSPF"): - return int(number*100/BTU_PER_WH) + return int(number * 100 / BTU_PER_WH) # 'Other' - e.g. 
wood stove - is not supported return int(number) def temp70(temperature_string): - """ Convert string Fahrenheit degrees to float F - 70 deg + """Convert string Fahrenheit degrees to float F - 70 deg >>> temp70('70F') 0.0 @@ -293,8 +296,7 @@ def temp70(temperature_string): -10.0 """ if not re.match(r"\d+F", temperature_string): - raise ValueError( - f"Unrecognized temperature format: {temperature_string}") + raise ValueError(f"Unrecognized temperature format: {temperature_string}") return float(temperature_string.strip().lower()[:-1]) - 70 @@ -318,7 +320,7 @@ def extract_window_area(value): @file_cache(CACHE_PATH) def _get_building_metadata(): - """ Helper function to retrieve and clean building metadata + """Helper function to retrieve and clean building metadata >>> metadata_df = _get_building_metadata() >>> isinstance(metadata_df, pd.DataFrame) @@ -332,128 +334,151 @@ def _get_building_metadata(): BUILDING_METADATA_PARQUET_PATH, columns=[ # features used directly or transformed - 'in.sqft', 'in.bedrooms', 'in.geometry_stories', - 'in.vintage', 'in.geometry_building_number_units_mf', - 'in.geometry_building_number_units_sfa', + "in.sqft", + "in.bedrooms", + "in.geometry_stories", + "in.vintage", + "in.geometry_building_number_units_mf", + "in.geometry_building_number_units_sfa", # features to be used to join with other datasets - 'in.county', # weather files + "in.county", # weather files # features that will be replaced with "reasonable assumptions" - 'in.occupants', + "in.occupants", # it's either ceiling or roof; only ~15K (<3%) have none - 'in.insulation_ceiling', 'in.insulation_roof', - 'in.insulation_floor', - 'in.insulation_slab', 'in.insulation_rim_joist', - 'in.infiltration', - - 'in.hvac_cooling_efficiency', 'in.hvac_heating_efficiency', - 'in.ducts', 'in.hvac_has_ducts', + "in.insulation_ceiling", + "in.insulation_roof", + "in.insulation_floor", + "in.insulation_slab", + "in.insulation_rim_joist", + "in.infiltration", + "in.hvac_cooling_efficiency", + "in.hvac_heating_efficiency", + "in.ducts", + "in.hvac_has_ducts", # to be filtered on - 'in.has_pv', 'in.geometry_building_type_acs', + "in.has_pv", + "in.geometry_building_type_acs", # ashrae_iecc_climate_zone_2004_2_a_split splits 2A states into # two groups, otherwise it's the same - 'in.ashrae_iecc_climate_zone_2004', - 'in.cooling_setpoint', 'in.heating_setpoint', - 'in.hvac_cooling_partial_space_conditioning', - + "in.ashrae_iecc_climate_zone_2004", + "in.cooling_setpoint", + "in.heating_setpoint", + "in.hvac_cooling_partial_space_conditioning", # Cooling/Heating offset must be important, too hard to get from # user. 
Also, it will have to be transformed into a timeseries var # 'in.cooling_setpoint_offset_magnitude', # 'in.cooling_setpoint_offset_period' # 'in.heating_setpoint_offset_magnitude', # 'in.heating_setpoint_offset_period' - - 'in.orientation', 'in.window_areas', - + "in.orientation", + "in.window_areas", # String/CATEGORICAL - 'in.geometry_foundation_type', 'in.windows', - 'in.lighting', 'in.insulation_wall', 'in.geometry_attic_type', + "in.geometry_foundation_type", + "in.windows", + "in.lighting", + "in.insulation_wall", + "in.geometry_attic_type", ], ).rename( # to make this code interchangeable with the spark tables columns={ - 'in.sqft': 'sqft', - 'in.bedrooms': 'bedrooms', - 'in.geometry_stories': 'stories', - 'in.occupants': 'occupants', - 'in.county': 'county', - 'in.ashrae_iecc_climate_zone_2004': 'ashrae_iecc_climate_zone', - 'in.geometry_foundation_type': 'foundation_type', - 'in.windows': 'windows_type', - 'in.insulation_wall': 'wall_type', - 'in.geometry_attic_type': 'attic_type', + "in.sqft": "sqft", + "in.bedrooms": "bedrooms", + "in.geometry_stories": "stories", + "in.occupants": "occupants", + "in.county": "county", + "in.ashrae_iecc_climate_zone_2004": "ashrae_iecc_climate_zone", + "in.geometry_foundation_type": "foundation_type", + "in.windows": "windows_type", + "in.insulation_wall": "wall_type", + "in.geometry_attic_type": "attic_type", } ) - pq.index.rename('building_id', inplace=True) + pq.index.rename("building_id", inplace=True) pq = pq[ - (pq['in.geometry_building_type_acs'] == 'Single-Family Detached') - & (pq['occupants'] != '10+') + (pq["in.geometry_building_type_acs"] == "Single-Family Detached") + & (pq["occupants"] != "10+") # sanity check; it's 1 for all single family detached # & (pq[ # ['in.geometry_building_number_units_mf', # 'in.geometry_building_number_units_sfa'] # ].replace('None', 1).max(axis=1).fillna(1).astype(int) == 1) # another sanity check; ResStock single family detached have 3 max - & (pq['stories'] <= '5') + & (pq["stories"] <= "5") # for some reason there are 14K 8194sqf single family detached homes - & (pq['sqft'] < 8000) + & (pq["sqft"] < 8000) # Not sure how to model these yet - & ~pq['in.hvac_heating_efficiency'].isin(['Other', 'Shared Heating']) - & (pq['in.hvac_cooling_efficiency'] != 'Shared Cooling') + & ~pq["in.hvac_heating_efficiency"].isin(["Other", "Shared Heating"]) + & (pq["in.hvac_cooling_efficiency"] != "Shared Cooling") # we'll get to solar, eventually - just not yet - & (pq['in.has_pv'] == 'No') + & (pq["in.has_pv"] == "No") ] pq = pq.assign( - age2000=pq['in.vintage'].map(vintage2age2000), - bedrooms=pq['bedrooms'].astype(int), - stories=pq['stories'].astype(int), - occupants=pq['occupants'].astype(int), - infiltration_ach50=pq['in.infiltration'].str.split().str[0].astype(int), - insulation_wall=pq['wall_type'].map(extract_r_value), - wall_material=pq['wall_type'].str.split(',').str[0], - insulation_slab=pq['in.insulation_slab'].map(extract_r_value), - insulation_rim_joist=pq['in.insulation_rim_joist'].map(extract_r_value), - insulation_floor=pq['in.insulation_floor'].map(extract_r_value), + age2000=pq["in.vintage"].map(vintage2age2000), + bedrooms=pq["bedrooms"].astype(int), + stories=pq["stories"].astype(int), + occupants=pq["occupants"].astype(int), + infiltration_ach50=pq["in.infiltration"].str.split().str[0].astype(int), + insulation_wall=pq["wall_type"].map(extract_r_value), + wall_material=pq["wall_type"].str.split(",").str[0], + insulation_slab=pq["in.insulation_slab"].map(extract_r_value), + 
insulation_rim_joist=pq["in.insulation_rim_joist"].map(extract_r_value), + insulation_floor=pq["in.insulation_floor"].map(extract_r_value), # In older versions of Pandas it should be `applymap`. - insulation_ceiling_roof=pq[ - ['in.insulation_ceiling', 'in.insulation_roof'] - ].map(extract_r_value).max(axis=1), + insulation_ceiling_roof=pq[["in.insulation_ceiling", "in.insulation_roof"]] + .map(extract_r_value) + .max(axis=1), cooling_efficiency_eer=pq[ - ['in.hvac_cooling_efficiency', 'in.hvac_heating_efficiency'] - ].agg(', '.join, axis=1).map(extract_cooling_efficiency), - heating_efficiency=pq['in.hvac_heating_efficiency'].map( - extract_heating_efficiency), - ac_type=pq['in.hvac_cooling_efficiency'].str.split(',').str[0], + ["in.hvac_cooling_efficiency", "in.hvac_heating_efficiency"] + ] + .agg(", ".join, axis=1) + .map(extract_cooling_efficiency), + heating_efficiency=pq["in.hvac_heating_efficiency"].map( + extract_heating_efficiency + ), + ac_type=pq["in.hvac_cooling_efficiency"].str.split(",").str[0], has_ac=( - pq['in.hvac_cooling_efficiency'].str.split(',').str[0] != 'None' + pq["in.hvac_cooling_efficiency"].str.split(",").str[0] != "None" ).astype(int), - has_ducts=pq['in.hvac_has_ducts'].map({'Yes': 1, 'No': 0}), - ducts_insulation=pq['in.ducts'].map(extract_r_value), - ducts_leakage=pq['in.ducts'].map(extract_percentage), - cooling_setpoint=pq['in.cooling_setpoint'].map(temp70), - heating_setpoint=pq['in.heating_setpoint'].map(temp70), - cooled_space_share=pq['in.hvac_cooling_partial_space_conditioning'].map(extract_percentage), - orientation=pq['in.orientation'].map(ORIENTATION_DEGREES), + has_ducts=pq["in.hvac_has_ducts"].map({"Yes": 1, "No": 0}), + ducts_insulation=pq["in.ducts"].map(extract_r_value), + ducts_leakage=pq["in.ducts"].map(extract_percentage), + cooling_setpoint=pq["in.cooling_setpoint"].map(temp70), + heating_setpoint=pq["in.heating_setpoint"].map(temp70), + cooled_space_share=pq["in.hvac_cooling_partial_space_conditioning"].map( + extract_percentage + ), + orientation=pq["in.orientation"].map(ORIENTATION_DEGREES), # door area in ResStock is always the same (20), and thus, useless - window_area=pq['in.window_areas'].map(extract_window_area), - lighting_efficiency=pq['in.lighting'].map(LUMINOUS_EFFICACY), + window_area=pq["in.window_areas"].map(extract_window_area), + lighting_efficiency=pq["in.lighting"].map(LUMINOUS_EFFICACY), ).drop( columns=[ - 'in.vintage', 'in.geometry_building_type_acs', - 'in.has_pv', 'in.geometry_building_number_units_mf', - 'in.geometry_building_number_units_sfa', - 'in.infiltration', - 'in.insulation_slab', 'in.insulation_rim_joist', - 'in.insulation_floor', - 'in.insulation_ceiling', 'in.insulation_roof', - 'in.hvac_cooling_efficiency', 'in.hvac_heating_efficiency', - 'in.hvac_has_ducts', 'in.ducts', - 'in.cooling_setpoint', 'in.heating_setpoint', - 'in.hvac_cooling_partial_space_conditioning', - 'in.orientation', 'in.window_areas', 'in.lighting', + "in.vintage", + "in.geometry_building_type_acs", + "in.has_pv", + "in.geometry_building_number_units_mf", + "in.geometry_building_number_units_sfa", + "in.infiltration", + "in.insulation_slab", + "in.insulation_rim_joist", + "in.insulation_floor", + "in.insulation_ceiling", + "in.insulation_roof", + "in.hvac_cooling_efficiency", + "in.hvac_heating_efficiency", + "in.hvac_has_ducts", + "in.ducts", + "in.cooling_setpoint", + "in.heating_setpoint", + "in.hvac_cooling_partial_space_conditioning", + "in.orientation", + "in.window_areas", + "in.lighting", ] ) - pq['backup_heating_efficiency'] 
= pq['heating_efficiency'] + pq["backup_heating_efficiency"] = pq["heating_efficiency"] # extra safety check to eliminate duplicate buildings # (not that there are any) @@ -461,8 +486,8 @@ def _get_building_metadata(): class BuildingMetadataBuilder: - """ A class to cache building metadata in memory. - """ + """A class to cache building metadata in memory.""" + _building_metadata = None def __init__(self): @@ -491,7 +516,7 @@ def __call__(self, building_id) -> pd.Series: def get_state_code_from_county_geoid(county_geoid): - """ Extract two-letter state code from a county geoid in ResStock format + """Extract two-letter state code from a county geoid in ResStock format >>> get_state_code_from_county_geoid('G0200130') 'AK' @@ -523,7 +548,7 @@ def get_state_code_from_county_geoid(county_geoid): @file_cache(CACHE_PATH) def get_hourly_outputs(building_id, upgrade_id, county_geoid): - """ Get hourly timeseries for a combination of building id and an upgrade id + """Get hourly timeseries for a combination of building id and an upgrade id The overall flow reproduces the Spark table created by https://github.com/rewiringamerica/pep/blob/dev/src/process/process_eulp_timeseries.py @@ -561,102 +586,134 @@ def get_hourly_outputs(building_id, upgrade_id, county_geoid): """ state = get_state_code_from_county_geoid(county_geoid) pqpath = HOURLY_OUTPUT_PATH.format( - building_id=building_id, upgrade_id=upgrade_id, state=state) + building_id=building_id, upgrade_id=upgrade_id, state=state + ) # To save RAM, it'd be good to cache the columns of the dataset and read # only the needed ones. So, we need a stateful function - this is a dirty # hack to implement this. - if not hasattr(get_hourly_outputs, 'columns'): - pqtemp = pd.read_parquet(pqpath).sort_values('timestamp') + if not hasattr(get_hourly_outputs, "columns"): + pqtemp = pd.read_parquet(pqpath).sort_values("timestamp") # skipping intensity and emissions columns columns = [ - column for column in pqtemp.columns - if column == 'timestamp' or column.endswith('.energy_consumption') + column + for column in pqtemp.columns + if column == "timestamp" or column.endswith(".energy_consumption") ] - setattr(get_hourly_outputs, 'columns', columns) + setattr(get_hourly_outputs, "columns", columns) column_renames = { - col: col[4:-19] for col in columns - if col.startswith('out.') and col.endswith('.energy_consumption') + col: col[4:-19] + for col in columns + if col.startswith("out.") and col.endswith(".energy_consumption") } - setattr(get_hourly_outputs, 'column_renames', column_renames) - timestep = pqtemp.iloc[1]['timestamp'] - pqtemp.iloc[0]['timestamp'] - setattr(get_hourly_outputs, 'timestep', timestep) + setattr(get_hourly_outputs, "column_renames", column_renames) + timestep = pqtemp.iloc[1]["timestamp"] - pqtemp.iloc[0]["timestamp"] + setattr(get_hourly_outputs, "timestep", timestep) fuel_types = set() appliance_types = set() appliance_groups = {} for column in column_renames.values(): - if '.' not in column: # timestamp + if "." 
not in column: # timestamp continue - fuel_type, appliance = column.split('.', 1) + fuel_type, appliance = column.split(".", 1) fuel_types.add(fuel_type) appliance_types.add(appliance) appliance_groups.setdefault(appliance, []) appliance_groups[appliance].append(column) - fuel_types -= {'site_energy'} + fuel_types -= {"site_energy"} # maybe remove appliances not covered by upgrades: grill, pool_pump, # and hot tub heater - appliance_types -= {'total', 'net', 'grill', 'pool_pump', } - setattr(get_hourly_outputs, 'fuel_types', fuel_types) - setattr(get_hourly_outputs, 'appliance_types', appliance_types) - setattr(get_hourly_outputs, 'appliance_groups', appliance_groups) + appliance_types -= { + "total", + "net", + "grill", + "pool_pump", + } + setattr(get_hourly_outputs, "fuel_types", fuel_types) + setattr(get_hourly_outputs, "appliance_types", appliance_types) + setattr(get_hourly_outputs, "appliance_groups", appliance_groups) # appliance mapping to aggregate by purpose # TODO: make groups separable by fuel, e.g. backup heating should be # a separate group. Heating/cooling fans should be separate, too. - setattr(get_hourly_outputs, 'consumption_groups', { - 'heating': [ - 'heating', 'heating_fans_pumps', 'heating_hp_bkup', - # fireplace in ResStock are only gas powered (i.e., not wood) - # and should be counted towards heating - 'fireplace', - ], - 'cooling': ['cooling', 'cooling_fans_pumps', ], - 'lighting': [ - 'lighting', 'lighting_interior', 'lighting_exterior', - 'lighting_garage', - ], - 'other': [ - 'hot_tub_heater', 'hot_tub_pump', 'hot_water', 'well_pump', - 'dishwasher', 'freezer', 'refrigerator', 'grill', 'range_oven', - # should fans and mech vent be considered cooling/heating? - 'ceiling_fan', 'mech_vent', - 'pool_heater', 'pool_pump', - 'clothes_dryer', 'clothes_washer', - 'plug_loads', # pv, # not considering solar (pv) yet - ] - }) + setattr( + get_hourly_outputs, + "consumption_groups", + { + "heating": [ + "heating", + "heating_fans_pumps", + "heating_hp_bkup", + # fireplace in ResStock are only gas powered (i.e., not wood) + # and should be counted towards heating + "fireplace", + ], + "cooling": [ + "cooling", + "cooling_fans_pumps", + ], + "lighting": [ + "lighting", + "lighting_interior", + "lighting_exterior", + "lighting_garage", + ], + "other": [ + "hot_tub_heater", + "hot_tub_pump", + "hot_water", + "well_pump", + "dishwasher", + "freezer", + "refrigerator", + "grill", + "range_oven", + # should fans and mech vent be considered cooling/heating? + "ceiling_fan", + "mech_vent", + "pool_heater", + "pool_pump", + "clothes_dryer", + "clothes_washer", + "plug_loads", # pv, # not considering solar (pv) yet + ], + }, + ) ho = ( pd.read_parquet(pqpath, columns=get_hourly_outputs.columns) - .set_index('timestamp') + .set_index("timestamp") .sort_index() ) # timestamps indicate the end of the period. 
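    # (For example, assuming the usual 15-minute ResStock timestep, the row
    #  stamped 00:15 covers the interval 00:00-00:15.)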
# To make use of pandas resampling, they should be set at the start - ho = ( - ho.set_index(ho.index-get_hourly_outputs.timestep) - .rename(columns=get_hourly_outputs.column_renames) + ho = ho.set_index(ho.index - get_hourly_outputs.timestep).rename( + columns=get_hourly_outputs.column_renames ) - ho = pd.DataFrame({ - appliance: ho[col_names].sum(axis=1) - for appliance, col_names in get_hourly_outputs.appliance_groups.items() - }) - ho = pd.DataFrame({ - group_name: ho[col_names].sum(axis=1) - for group_name, col_names in get_hourly_outputs.consumption_groups.items() - }) + ho = pd.DataFrame( + { + appliance: ho[col_names].sum(axis=1) + for appliance, col_names in get_hourly_outputs.appliance_groups.items() + } + ) + ho = pd.DataFrame( + { + group_name: ho[col_names].sum(axis=1) + for group_name, col_names in get_hourly_outputs.consumption_groups.items() + } + ) - return ho.resample('H').sum() + return ho.resample("H").sum() # a predefined -REFERENCE_YEAR = pd.date_range(start='1/1/2007', periods=HOURS_IN_A_YEAR, freq='H') +REFERENCE_YEAR = pd.date_range(start="1/1/2007", periods=HOURS_IN_A_YEAR, freq="H") @file_cache(CACHE_PATH) def get_weather_file(county_geoid: str) -> pd.DataFrame: - """ Retrieve weather timeseries for a given county geoid in ResStock + """Retrieve weather timeseries for a given county geoid in ResStock It takes about 150..200ms to read a file from a GCP bucket. With ~3K files, that's ~10min worst case for the entire dataset. This function returns all @@ -684,16 +741,18 @@ def get_weather_file(county_geoid: str) -> pd.DataFrame: state = get_state_code_from_county_geoid(county_geoid) weather_file_path = WEATHER_FILES_PATH.format(state=state, geoid=county_geoid) df = pd.read_csv( - weather_file_path, parse_dates=['date_time'], index_col=['date_time'] - ).rename(columns={ - 'Dry Bulb Temperature [°C]': 'temp_air', - 'Relative Humidity [%]': 'relative_humidity', - 'Wind Speed [m/s]': 'wind_speed', - 'Wind Direction [Deg]': 'wind_direction', - 'Global Horizontal Radiation [W/m2]': 'ghi', - 'Direct Normal Radiation [W/m2]': 'dni', - 'Diffuse Horizontal Radiation [W/m2]': 'diffuse_horizontal_illum' - }) + weather_file_path, parse_dates=["date_time"], index_col=["date_time"] + ).rename( + columns={ + "Dry Bulb Temperature [°C]": "temp_air", + "Relative Humidity [%]": "relative_humidity", + "Wind Speed [m/s]": "wind_speed", + "Wind Direction [Deg]": "wind_direction", + "Global Horizontal Radiation [W/m2]": "ghi", + "Direct Normal Radiation [W/m2]": "dni", + "Diffuse Horizontal Radiation [W/m2]": "diffuse_horizontal_illum", + } + ) # in TMY3 files, weather year is a combination of months from different # years. 
Resstock overrides year for these files, so only month-day-hour # portion matters @@ -712,7 +771,7 @@ def get_weather_file(county_geoid: str) -> pd.DataFrame: def apply_upgrades(building_features: pd.Series, upgrade_id: int) -> pd.Series: - """ Augment building features to reflect the upgrade + """Augment building features to reflect the upgrade Thoughts: it is more efficient to apply these upgrades to an entire dataframe, but it is a lot harder to test @@ -749,19 +808,21 @@ def apply_upgrades(building_features: pd.Series, upgrade_id: int) -> pd.Series: if upgrade_id == 1: # basic enclosure # applies only to vented attic dwellings - cz_family = building_features['ashrae_iecc_climate_zone'][0] + cz_family = building_features["ashrae_iecc_climate_zone"][0] threshold, insulation = BASIC_ENCLOSURE_INSULATION[cz_family] - if (building_features['attic_type'] == 'Vented Attic' - and building_features['insulation_ceiling_roof'] <= threshold): - building_features['insulation_ceiling_roof'] = insulation + if ( + building_features["attic_type"] == "Vented Attic" + and building_features["insulation_ceiling_roof"] <= threshold + ): + building_features["insulation_ceiling_roof"] = insulation # Manual has two thresholds, 10 and 15. In the .yml it's applied at 15 - if building_features['infiltration_ach50'] >= 15: - building_features['infiltration_ach50'] *= 0.7 - if building_features['ducts_leakage'] > 0: - building_features['ducts_leakage'] = 0.1 - building_features['ducts_insulation'] = 8.0 - if building_features['wall_type'] == 'Wood Stud, Uninsulated': - building_features['insulation_wall'] = extract_r_value('Wood Stud, R-13') + if building_features["infiltration_ach50"] >= 15: + building_features["infiltration_ach50"] *= 0.7 + if building_features["ducts_leakage"] > 0: + building_features["ducts_leakage"] = 0.1 + building_features["ducts_insulation"] = 8.0 + if building_features["wall_type"] == "Wood Stud, Uninsulated": + building_features["insulation_wall"] = extract_r_value("Wood Stud, R-13") return building_features # if upgrade_id == 2: # enhanced enclosure @@ -769,32 +830,50 @@ def apply_upgrades(building_features: pd.Series, upgrade_id: int) -> pd.Series: if upgrade_id == 3: # heat pump, min efficiency, electric backup # both ducted and ductless: SEER 15, 9 HSPF - building_features['cooling_efficiency_eer'] = extract_cooling_efficiency('Heat Pump, SEER 15, 9 HSPF') - building_features['heating_efficiency'] = extract_heating_efficiency('Heat Pump, SEER 15, 9 HSPF') - building_features['backup_heating_efficiency'] = 1.0 - building_features['ac_type'] = 'Heat Pump' - building_features['has_ac'] = 1 + building_features["cooling_efficiency_eer"] = extract_cooling_efficiency( + "Heat Pump, SEER 15, 9 HSPF" + ) + building_features["heating_efficiency"] = extract_heating_efficiency( + "Heat Pump, SEER 15, 9 HSPF" + ) + building_features["backup_heating_efficiency"] = 1.0 + building_features["ac_type"] = "Heat Pump" + building_features["has_ac"] = 1 return building_features if upgrade_id == 4: # heat pump, high efficiency, electric backup - if building_features['has_ducts']: # ducted systems: SEER 24, 13 HSPF - building_features['cooling_efficiency_eer'] = extract_cooling_efficiency('Heat Pump, SEER 24, 13 HSPF') - building_features['heating_efficiency'] = extract_heating_efficiency('Heat Pump, SEER 24, 13 HSPF') + if building_features["has_ducts"]: # ducted systems: SEER 24, 13 HSPF + building_features["cooling_efficiency_eer"] = extract_cooling_efficiency( + "Heat Pump, SEER 24, 13 HSPF" + ) + 
building_features["heating_efficiency"] = extract_heating_efficiency( + "Heat Pump, SEER 24, 13 HSPF" + ) else: # ductless dwellings: SEER 29.3, 14 HSPF, - building_features['cooling_efficiency_eer'] = extract_cooling_efficiency('Heat Pump, SEER 29.3, 14 HSPF') - building_features['heating_efficiency'] = extract_heating_efficiency('Heat Pump, SEER 29.3, 14 HSPF') - building_features['backup_heating_efficiency'] = 1.0 - building_features['ac_type'] = 'Heat Pump' - building_features['has_ac'] = 1 + building_features["cooling_efficiency_eer"] = extract_cooling_efficiency( + "Heat Pump, SEER 29.3, 14 HSPF" + ) + building_features["heating_efficiency"] = extract_heating_efficiency( + "Heat Pump, SEER 29.3, 14 HSPF" + ) + building_features["backup_heating_efficiency"] = 1.0 + building_features["ac_type"] = "Heat Pump" + building_features["has_ac"] = 1 return building_features if upgrade_id == 5: # high efficiency HP, existing heating as backup # both ducted and ductless: SEER 15, 9 HSPF - building_features['backup_heating_efficiency'] = building_features['heating_efficiency'] - building_features['cooling_efficiency_eer'] = extract_cooling_efficiency('Heat Pump, SEER 15, 9 HSPF') - building_features['heating_efficiency'] = extract_heating_efficiency('Heat Pump, SEER 15, 9 HSPF') - building_features['ac_type'] = 'Heat Pump' - building_features['has_ac'] = 1 + building_features["backup_heating_efficiency"] = building_features[ + "heating_efficiency" + ] + building_features["cooling_efficiency_eer"] = extract_cooling_efficiency( + "Heat Pump, SEER 15, 9 HSPF" + ) + building_features["heating_efficiency"] = extract_heating_efficiency( + "Heat Pump, SEER 15, 9 HSPF" + ) + building_features["ac_type"] = "Heat Pump" + building_features["has_ac"] = 1 return building_features # if upgrade_id == 6: # heat pump water heater @@ -809,7 +888,7 @@ def apply_upgrades(building_features: pd.Series, upgrade_id: int) -> pd.Series: def train_test_split(dataset: np.array, left_size): - """ Split the provided array into two random shares + """Split the provided array into two random shares Why: `tf.keras.utils.split_dataset()`-based iterators are a bit slow for small experiments, with iteration over 500k examples taking 25s. @@ -818,12 +897,12 @@ def train_test_split(dataset: np.array, left_size): `tf.keras.utils.split_dataset()` is still the preferred option """ np.random.shuffle(dataset) - split_point = int(len(dataset)*left_size) + split_point = int(len(dataset) * left_size) return dataset[:split_point], dataset[split_point:] def parallelize(func, args, num_threads=None): - num_threads = num_threads or min(os.cpu_count()*10, 50) + num_threads = num_threads or min(os.cpu_count() * 10, 50) tp = ThreadPool(num_threads) for _ in tp.imap_unordered(func, args): pass @@ -835,29 +914,46 @@ def parallelize(func, args, num_threads=None): class DataGen(tf.keras.utils.Sequence): batch_size: int upgrades = (0, 1, 3, 4, 5) - weather_features = ('temp_air', 'ghi', 'wind_speed', 'weekend', 'hour') + weather_features = ("temp_air", "ghi", "wind_speed", "weekend", "hour") # features model will be trained on by default. 
# For all available features, check self.building_features_df columns building_features = ( # numeric - 'sqft', 'bedrooms', 'stories', 'occupants', 'age2000', - 'infiltration_ach50', 'insulation_wall', 'insulation_ceiling_roof', - 'cooling_efficiency_eer', 'heating_efficiency', - 'backup_heating_efficiency', 'has_ducts', - 'insulation_slab', 'insulation_rim_joist', 'insulation_floor', - 'cooling_setpoint', 'heating_setpoint', 'orientation', 'window_area', - 'lighting_efficiency', 'cooled_space_share', - + "sqft", + "bedrooms", + "stories", + "occupants", + "age2000", + "infiltration_ach50", + "insulation_wall", + "insulation_ceiling_roof", + "cooling_efficiency_eer", + "heating_efficiency", + "backup_heating_efficiency", + "has_ducts", + "insulation_slab", + "insulation_rim_joist", + "insulation_floor", + "cooling_setpoint", + "heating_setpoint", + "orientation", + "window_area", + "lighting_efficiency", + "cooled_space_share", # categorical - 'foundation_type', 'windows_type', 'wall_material', - + "foundation_type", + "windows_type", + "wall_material", # service features - not to be fed to the model # 'county', ) # skipping 'other' and 'lighting' here. Both are artificial and are unlikely # to predict real life usage well - consumption_groups = ('heating', 'cooling',) + consumption_groups = ( + "heating", + "cooling", + ) time_granularity = None # Building ids only. Not used, for debugging purpose only building_ids: np.array @@ -876,10 +972,18 @@ class DataGen(tf.keras.utils.Sequence): # weather cache has a 2-level index, (weather_station, time_step) weather_cache = None - def __init__(self, building_ids, upgrade_ids=None, weather_features=None, - building_features=None, consumption_groups=None, - time_granularity='Y', batch_size=64, metadata_builder=None, - dtype=np.float32): + def __init__( + self, + building_ids, + upgrade_ids=None, + weather_features=None, + building_features=None, + consumption_groups=None, + time_granularity="Y", + batch_size=64, + metadata_builder=None, + dtype=np.float32, + ): """ Args: building_ids: (Iterable[int]) integer ids of the buildings in this @@ -919,19 +1023,19 @@ def __init__(self, building_ids, upgrade_ids=None, weather_features=None, self.metadata_builder = metadata_builder or BuildingMetadataBuilder() self.dtype = dtype number_of_time_steps = { - 'D': 365, - 'M': 12, - 'Q': 4, - 'Y': 1, + "D": 365, + "M": 12, + "Q": 4, + "Y": 1, }[time_granularity] self.timestep_length = HOURS_IN_A_YEAR // number_of_time_steps time_steps = tuple(range(number_of_time_steps)) self.ids = pd.DataFrame( itertools.product(self.building_ids, self.upgrades, time_steps), - columns=['building_id', 'upgrade_id', 'time_step'] + columns=["building_id", "upgrade_id", "time_step"], ) - counties = self.metadata_builder.all().loc[self.building_ids, 'county'] - self.ids['county'] = self.ids['building_id'].map(counties) + counties = self.metadata_builder.all().loc[self.building_ids, "county"] + self.ids["county"] = self.ids["building_id"].map(counties) self.building_features_df = self.init_building_cache() self.weather_cache = self.init_weather_cache() @@ -947,17 +1051,17 @@ def init_building_cache(self): # required by `apply_upgrades` df = self.metadata_builder.all().loc[self.building_ids] # `.explode()` needs a column of iterables, thus a column of tuples - df['upgrade_id'] = [self.upgrades]*len(df) - df = df.explode('upgrade_id') - building_features_df = pd.DataFrame([ - apply_upgrades(row, row['upgrade_id']) for _, row in df.iterrows() - ]) - building_features_df.index.name = 
'building_id' + df["upgrade_id"] = [self.upgrades] * len(df) + df = df.explode("upgrade_id") + building_features_df = pd.DataFrame( + [apply_upgrades(row, row["upgrade_id"]) for _, row in df.iterrows()] + ) + building_features_df.index.name = "building_id" building_features_df.reset_index(inplace=True) - return building_features_df.set_index(['building_id', 'upgrade_id']) + return building_features_df.set_index(["building_id", "upgrade_id"]) def batch_building_features(self, batch_ids): - """ Get building features for a batch + """Get building features for a batch This method is intended to abstract the structure of building feature cache. @@ -968,15 +1072,12 @@ def batch_building_features(self, batch_ids): Returns: Dict[str, np.ndarray]: a dictionary of building features """ - idx = batch_ids[['building_id', 'upgrade_id']].apply(tuple, axis=1) + idx = batch_ids[["building_id", "upgrade_id"]].apply(tuple, axis=1) df = self.building_features_df.loc[idx] - return { - feature: df[feature].values - for feature in self.building_features - } + return {feature: df[feature].values for feature in self.building_features} def init_weather_cache(self): - """ Initialize in-memory weather cache + """Initialize in-memory weather cache TODO: accommodate for weather embeddings The intent of this method is to build an in-memory cache of weather data @@ -996,10 +1097,9 @@ def init_weather_cache(self): month, or day. As a result, we'll get a dataframe with a 2-level multiindex ( """ - counties = self.ids['county'].unique() + counties = self.ids["county"].unique() weather_cache = {} - logging.warning( - f"Building weather cache for {len(counties)} counties...") + logging.warning(f"Building weather cache for {len(counties)} counties...") def fill_cache(county_geoid): weather_cache[county_geoid] = get_weather_file(county_geoid) @@ -1008,14 +1108,14 @@ def fill_cache(county_geoid): logging.warning("...almost done") weather_cache = { - feature: pd.DataFrame([ - weather_cache[county][feature] for county in counties - ], index=counties) + feature: pd.DataFrame( + [weather_cache[county][feature] for county in counties], index=counties + ) for feature in self.weather_features } def aggregate_by_time(df, feature_name): - """ Slice timeseries according to time granularity. + """Slice timeseries according to time granularity. 
The input df is expected to have counties as index and dt columns @@ -1023,15 +1123,13 @@ def aggregate_by_time(df, feature_name): pd.DataFrame: a dataframe with a single column of numpy arrays """ df.columns = np.arange(len(df.columns)) - df.columns.name = 'timestamp' - df.index.name = 'county' - df = pd.DataFrame( - {feature_name: df.stack()} - ).reset_index() - df['time_step'] = df['timestamp'] // self.timestep_length - df['timestamp'] %= self.timestep_length - df.set_index(['county', 'time_step'], inplace=True) - return df.pivot(columns='timestamp', values=feature_name) + df.columns.name = "timestamp" + df.index.name = "county" + df = pd.DataFrame({feature_name: df.stack()}).reset_index() + df["time_step"] = df["timestamp"] // self.timestep_length + df["timestamp"] %= self.timestep_length + df.set_index(["county", "time_step"], inplace=True) + return df.pivot(columns="timestamp", values=feature_name) weather_cache = { feature: aggregate_by_time(df, feature) @@ -1041,7 +1139,7 @@ def aggregate_by_time(df, feature_name): return weather_cache def batch_weather_features(self, batch_ids): - """ Get weather features for a batch + """Get weather features for a batch Similar to `batch_building_features`, the purpose is to abstract structure of weather cache. @@ -1055,7 +1153,7 @@ def batch_weather_features(self, batch_ids): size - e.g., with monthly aggregation some months are 30 days and some are 28 or 31. """ - idx = batch_ids[['county', 'time_step']].apply(tuple, axis=1) + idx = batch_ids[["county", "time_step"]].apply(tuple, axis=1) return { feature: self.weather_cache[feature].loc[idx].values for feature in self.weather_features @@ -1069,11 +1167,11 @@ def init_output_cache(self): """ ids = [ (building_id, upgrade_id, county) - for (building_id, upgrade_id), county - in self.building_features_df['county'].items() + for (building_id, upgrade_id), county in self.building_features_df[ + "county" + ].items() ] - logging.warning( - f"Building output cache for {len(ids)} buildings...") + logging.warning(f"Building output cache for {len(ids)} buildings...") output_cache = {} def fill_cache(batch_id): @@ -1085,21 +1183,20 @@ def fill_cache(batch_id): logging.warning("...almost done") output_cache = { - group: pd.DataFrame([ - output_cache[batch_id][group] for batch_id in ids - ], index=self.building_features_df.index) + group: pd.DataFrame( + [output_cache[batch_id][group] for batch_id in ids], + index=self.building_features_df.index, + ) for group in self.consumption_groups } def aggregate_by_time(df, feature_name): - """ Slice timeseries according to time granularity. + """Slice timeseries according to time granularity. 
The input df is expected to have counties as index and dt columns """ df.columns.name = self.time_granularity - return pd.DataFrame( - {feature_name: df.stack()} - ) + return pd.DataFrame({feature_name: df.stack()}) output_cache = { feature: aggregate_by_time(df, feature) @@ -1110,19 +1207,18 @@ def aggregate_by_time(df, feature_name): return output_cache def batch_output_features(self, batch_ids): - idx = batch_ids[ - ['building_id', 'upgrade_id', 'time_step']].apply(tuple, axis=1) + idx = batch_ids[["building_id", "upgrade_id", "time_step"]].apply(tuple, axis=1) return { group: self.output_cache[group].loc[idx].values for group in self.consumption_groups } def feature_dtype(self, feature_name): - is_string_feature = self.building_features_df[feature_name].dtype == 'O' + is_string_feature = self.building_features_df[feature_name].dtype == "O" return tf.string if is_string_feature else self.dtype def feature_vocab(self, feature_name): - """ Get all possible values for a feature + """Get all possible values for a feature This method is used to create encoders for string (categorical/ordinal) features @@ -1139,11 +1235,11 @@ def on_epoch_end(self): pass def __getitem__(self, idx): - """ Generate a batch #`idx` + """Generate a batch #`idx` This method should produce a dictionary of numpy arrays (or tensors) """ - batch_ids = self.ids[idx*self.batch_size:(idx+1)*self.batch_size] + batch_ids = self.ids[idx * self.batch_size : (idx + 1) * self.batch_size] # for last batch, batch_size might be different from self.batch_size features = self.batch_building_features(batch_ids) features.update(self.batch_weather_features(batch_ids)) @@ -1151,7 +1247,7 @@ def __getitem__(self, idx): return features, self.batch_output_features(batch_ids) -if __name__ == '__main__': +if __name__ == "__main__": np.random.seed(42) # 42 is always the answer N = 5 get_building_metadata = BuildingMetadataBuilder() diff --git a/deprecated/infra_agnostic/model.py b/deprecated/infra_agnostic/model.py index f03b8b5..81805e6 100644 --- a/deprecated/infra_agnostic/model.py +++ b/deprecated/infra_agnostic/model.py @@ -18,7 +18,8 @@ def create_dataset(datagen_params: Dict, train_test_split=0.9): building_ids = get_building_metadata.building_ids np.random.shuffle(building_ids) train_buildings, test_buildings = datagen.train_test_split( - building_ids, left_size=train_test_split) + building_ids, left_size=train_test_split + ) train_gen = datagen.DataGen(train_buildings, **datagen_params) test_gen = datagen.DataGen(test_buildings, **datagen_params) @@ -31,7 +32,7 @@ def gaussian_activation(x): def replace_weather_with_embeddings(gen, weather_model): - """ Replace weather features in `gen` with embeddings using `weather_model` + """Replace weather features in `gen` with embeddings using `weather_model` Args: gen: (datagen.DataGen) an instance of a data generator class @@ -39,7 +40,7 @@ def replace_weather_with_embeddings(gen, weather_model): # all weather feature dfs are built using the same index. 
# To be safe, bulletproofing this code sample_weather_feature = gen.weather_features[0] - if sample_weather_feature == 'weather_embedding': + if sample_weather_feature == "weather_embedding": return # already transformed idx = gen.weather_cache[sample_weather_feature].index @@ -47,17 +48,20 @@ def replace_weather_with_embeddings(gen, weather_model): weather_feature: gen.weather_cache[weather_feature].loc[idx].values for weather_feature in gen.weather_features } - weather_embeddings = pd.DataFrame(weather_model.predict(weather_embed_input), index=idx) + weather_embeddings = pd.DataFrame( + weather_model.predict(weather_embed_input), index=idx + ) gen._weather_features = gen.weather_features - gen.weather_features = ['weather_embedding'] - gen.weather_cache['weather_embedding'] = weather_embeddings + gen.weather_features = ["weather_embedding"] + gen.weather_cache["weather_embedding"] = weather_embeddings def create_building_model(train_gen, layer_params): bmo_inputs_dict = { building_feature: layers.Input( - name=building_feature, shape=(1,), - dtype=train_gen.feature_dtype(building_feature) + name=building_feature, + shape=(1,), + dtype=train_gen.feature_dtype(building_feature), ) for building_feature in train_gen.building_features } @@ -69,76 +73,88 @@ def create_building_model(train_gen, layer_params): for feature, layer in bmo_inputs_dict.items(): if train_gen.feature_dtype(feature) == tf.string: encoder = layers.StringLookup( - name=feature+'_encoder', output_mode='one_hot', - dtype=layer_params['dtype'] + name=feature + "_encoder", + output_mode="one_hot", + dtype=layer_params["dtype"], ) encoder.adapt(train_gen.feature_vocab(feature)) layer = encoder(layer) bmo_inputs.append(layer) - m = layers.Concatenate(name='concat_layer', dtype=layer_params['dtype'])(bmo_inputs) + m = layers.Concatenate(name="concat_layer", dtype=layer_params["dtype"])(bmo_inputs) - m = layers.Dense(32, name='second_dense', **layer_params)(m) - m = layers.Dense(8, name='third_dense', **layer_params)(m) + m = layers.Dense(32, name="second_dense", **layer_params)(m) + m = layers.Dense(8, name="third_dense", **layer_params)(m) # TODO: consider applying batchnorm # m = layers.BatchNormalization()(m) bmo = models.Model( - inputs=bmo_inputs_dict, outputs=m, name='building_features_model') + inputs=bmo_inputs_dict, outputs=m, name="building_features_model" + ) return bmo_inputs_dict, bmo def create_weather_model(train_gen, layer_params): weather_inputs_dict = { weather_feature: layers.Input( - name=weather_feature, shape=(None, 1,), dtype=layer_params['dtype']) + name=weather_feature, + shape=( + None, + 1, + ), + dtype=layer_params["dtype"], + ) for weather_feature in train_gen.weather_features } weather_inputs = list(weather_inputs_dict.values()) wm = layers.Concatenate( - axis=-1, name='weather_concat_layer', dtype=layer_params['dtype'] + axis=-1, name="weather_concat_layer", dtype=layer_params["dtype"] )(weather_inputs) wm = layers.Conv1D( - filters=16, # reasonable range is 4..32 + filters=16, # reasonable range is 4..32 kernel_size=4, - padding='same', - data_format='channels_last', - name='first_1dconv', - **layer_params + padding="same", + data_format="channels_last", + name="first_1dconv", + **layer_params, )(wm) # Performance with only one layer of CNN is abismal. 
# Use at least one more layer wm = layers.Conv1D( filters=16, kernel_size=4, - padding='same', - data_format='channels_last', - name='last_1dconv', + padding="same", + data_format="channels_last", + name="last_1dconv", # activation=gaussian_activation, - **layer_params + **layer_params, )(wm) # sum the time dimension - wm = layers.Lambda( - lambda x: K.sum(x, axis=1), dtype=layer_params['dtype'])(wm) + wm = layers.Lambda(lambda x: K.sum(x, axis=1), dtype=layer_params["dtype"])(wm) wmo = models.Model( - inputs=weather_inputs_dict, outputs=wm, name='weather_features_model') + inputs=weather_inputs_dict, outputs=wm, name="weather_features_model" + ) return weather_inputs_dict, wmo def create_combined_model(train_gen, bmo, wmo, layer_params): combined_inputs_dict = { - 'building_embedding': layers.Input( - name='building_embedding', shape=(bmo.output.shape[1],), - dtype=layer_params['dtype']), - 'weather_embedding': layers.Input( - name='weather_embedding', shape=(wmo.output.shape[1],), - dtype=layer_params['dtype']), + "building_embedding": layers.Input( + name="building_embedding", + shape=(bmo.output.shape[1],), + dtype=layer_params["dtype"], + ), + "weather_embedding": layers.Input( + name="weather_embedding", + shape=(wmo.output.shape[1],), + dtype=layer_params["dtype"], + ), } combined_inputs = list(combined_inputs_dict.values()) - cm = layers.Concatenate(name='combine_features')(combined_inputs) + cm = layers.Concatenate(name="combine_features")(combined_inputs) cm = layers.Dense(16, **layer_params)(cm) cm = layers.Dense(16, **layer_params)(cm) @@ -147,21 +163,21 @@ def create_combined_model(train_gen, bmo, wmo, layer_params): # would be a dict if these outputs were final combined_outputs = {} for consumption_group in train_gen.consumption_groups: - io = layers.Dense(8, name=consumption_group+'_entry', **layer_params)(cm) + io = layers.Dense(8, name=consumption_group + "_entry", **layer_params)(cm) # ... 
feel free to add more layers - io = layers.Dense(8, name=consumption_group+'_mid', **layer_params)(io) + io = layers.Dense(8, name=consumption_group + "_mid", **layer_params)(io) # no activation on the output io = layers.Dense(1, name=consumption_group, **layer_params)(io) combined_outputs[consumption_group] = io combined_model = models.Model( - inputs=combined_inputs_dict, outputs=combined_outputs, - name='combined_model') + inputs=combined_inputs_dict, outputs=combined_outputs, name="combined_model" + ) return combined_inputs_dict, combined_model def create_model(layer_params=None): - """ End to end model architecture definition + """End to end model architecture definition Model config should include: - datagen config @@ -188,26 +204,31 @@ def create_model(layer_params=None): # Combined model and separate towers for output groups combined_inputs_dict, combined_model = create_combined_model( - train_gen, bmo, wmo, layer_params) + train_gen, bmo, wmo, layer_params + ) building_embedding = bmo(bmo_inputs_dict) weather_embedding = wmo(weather_inputs_dict) - combined_output = combined_model({ - 'building_embedding': building_embedding, - 'weather_embedding': weather_embedding - }) + combined_output = combined_model( + { + "building_embedding": building_embedding, + "weather_embedding": weather_embedding, + } + ) final_model = models.Model( inputs=itertools.ChainMap(bmo_inputs_dict, weather_inputs_dict), - outputs=combined_output + outputs=combined_output, ) - final_model.compile(loss=keras.losses.MeanAbsoluteError(), optimizer='adam') + final_model.compile(loss=keras.losses.MeanAbsoluteError(), optimizer="adam") # return final_model history = final_model.fit( - train_gen, epochs=100, validation_data=test_gen, - callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=5)] + train_gen, + epochs=100, + validation_data=test_gen, + callbacks=[keras.callbacks.EarlyStopping(monitor="loss", patience=5)], ) # Experimental: fix weather embeddings and continue training @@ -216,31 +237,39 @@ def create_model(layer_params=None): replace_weather_with_embeddings(train_gen, wmo) replace_weather_with_embeddings(test_gen, wmo) - combined_output2 = combined_model({ - 'building_embedding': building_embedding, - 'weather_embedding': combined_inputs_dict['weather_embedding'] - }) + combined_output2 = combined_model( + { + "building_embedding": building_embedding, + "weather_embedding": combined_inputs_dict["weather_embedding"], + } + ) - final_model2 = models.Model(inputs=itertools.ChainMap(bmo_inputs_dict, { - 'weather_embedding': combined_inputs_dict['weather_embedding'] - }), outputs=combined_output2) - final_model2.compile(loss=keras.losses.MeanAbsoluteError(),optimizer='adam') + final_model2 = models.Model( + inputs=itertools.ChainMap( + bmo_inputs_dict, + {"weather_embedding": combined_inputs_dict["weather_embedding"]}, + ), + outputs=combined_output2, + ) + final_model2.compile(loss=keras.losses.MeanAbsoluteError(), optimizer="adam") history2 = final_model2.fit( - train_gen, epochs=200, validation_data=test_gen, - callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=10)] + train_gen, + epochs=200, + validation_data=test_gen, + callbacks=[keras.callbacks.EarlyStopping(monitor="loss", patience=10)], ) return final_model2, wmo def plot_history(history): - plt.plot(history.history['loss']) - plt.plot(history.history['val_loss']) + plt.plot(history.history["loss"]) + plt.plot(history.history["val_loss"]) - plt.title('model loss') - plt.ylabel('loss') - plt.xlabel('epoch') - 
plt.legend(['train', 'val'], loc='upper left') + plt.title("model loss") + plt.ylabel("loss") + plt.xlabel("epoch") + plt.legend(["train", "val"], loc="upper left") plt.show() @@ -248,69 +277,87 @@ def debug_scatterplots(gen, final_model): gt = np.empty((len(gen.ids), len(gen.consumption_groups))) for batch_num in range(len(gen)): _, batch_gt = gen[batch_num] - batch_gt = batch_gt['outputs'] - gt[gen.batch_size * batch_num:gen.batch_size * batch_num + len(batch_gt)] = batch_gt.sum(axis=-1) - predictions = final_model.predict(gen)['outputs'] - groups = { - 'gt': gt, - 'pred': predictions - } - df = pd.DataFrame({ - group + '_' + consumption : groups[group][:, colnum] - for group, (colnum, consumption) in itertools.product(groups, enumerate(gen.consumption_groups)) - }) + batch_gt = batch_gt["outputs"] + gt[gen.batch_size * batch_num : gen.batch_size * batch_num + len(batch_gt)] = ( + batch_gt.sum(axis=-1) + ) + predictions = final_model.predict(gen)["outputs"] + groups = {"gt": gt, "pred": predictions} + df = pd.DataFrame( + { + group + "_" + consumption: groups[group][:, colnum] + for group, (colnum, consumption) in itertools.product( + groups, enumerate(gen.consumption_groups) + ) + } + ) for consumption_group in gen.consumption_groups: - df.plot.scatter(*(group+'_'+consumption_group for group in groups)) + df.plot.scatter(*(group + "_" + consumption_group for group in groups)) # consider checking df.corr() def main(): layer_params = { - 'activation': 'leaky_relu', - 'dtype': np.float32, + "activation": "leaky_relu", + "dtype": np.float32, } final_model = create_model(layer_params) model_architecture_img = keras.utils.plot_model( - final_model, to_file="model.png", show_shapes=True, show_dtype=True, - rankdir="TB", dpi=200, show_layer_activations=True, + final_model, + to_file="model.png", + show_shapes=True, + show_dtype=True, + rankdir="TB", + dpi=200, + show_layer_activations=True, ) get_building_metadata = datagen.BuildingMetadataBuilder() datagen_params = { - 'metadata_builder': get_building_metadata, - 'batch_size': 64, + "metadata_builder": get_building_metadata, + "batch_size": 64, # 'consumption_groups': ( # 'heating', 'cooling', # 'lighting', 'other', # ), - 'weather_features': ( - 'temp_air', 'ghi', 'wind_speed', + "weather_features": ( + "temp_air", + "ghi", + "wind_speed", # 'weekend', 'hour', # 'relative_humidity', 'dni', 'diffuse_horizontal_illum', # 'wind_direction', ), - 'building_features': ( - 'sqft', 'bedrooms', 'stories', 'occupants', 'age2000', 'county', - 'infiltration_ach50', 'insulation_wall', 'insulation_ceiling_roof', - 'cooling_efficiency_eer', 'heating_efficiency', - - 'cooling_setpoint', 'heating_setpoint', + "building_features": ( + "sqft", + "bedrooms", + "stories", + "occupants", + "age2000", + "county", + "infiltration_ach50", + "insulation_wall", + "insulation_ceiling_roof", + "cooling_efficiency_eer", + "heating_efficiency", + "cooling_setpoint", + "heating_setpoint", # 'insulation_slab', 'insulation_rim_joist', 'insulation_floor', # 'orientation', 'window_area', # 'lighting_efficiency', - - # categorical # 'foundation_type', 'windows_type', ), - 'dtype': layer_params['dtype'], + "dtype": layer_params["dtype"], # 'time_granularity': 'M', } train_gen, test_gen = create_dataset(datagen_params) history = final_model.fit( - train_gen, epochs=100, validation_data=test_gen, - callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=5)] + train_gen, + epochs=100, + validation_data=test_gen, + callbacks=[keras.callbacks.EarlyStopping(monitor="loss", 
patience=5)], ) plot_history(history) debug_scatterplots(test_gen, final_model) diff --git a/deprecated/infra_agnostic/utils.py b/deprecated/infra_agnostic/utils.py index 09a4d66..6eda76b 100644 --- a/deprecated/infra_agnostic/utils.py +++ b/deprecated/infra_agnostic/utils.py @@ -7,7 +7,7 @@ def file_cache(cache_path=None): - """ A file caching decorator + """A file caching decorator Assumptions about the underlying function: - all arguments are positional, and are strings @@ -49,9 +49,8 @@ def __init__(self, function): def __call__(self, *args): # function name is to have a meaningful key even with empty args - arg_key = '__'.join(str(arg) for arg in (self.f.__name__, *args)) - cache_fpath = os.path.join( - self.func_cache_path, arg_key + '.pq') + arg_key = "__".join(str(arg) for arg in (self.f.__name__, *args)) + cache_fpath = os.path.join(self.func_cache_path, arg_key + ".pq") if os.path.exists(cache_fpath): return pd.read_parquet(cache_fpath) res = self.f(*args) @@ -59,7 +58,7 @@ def __call__(self, *args): return res def reset_cache(self): - for fpath in glob.glob(os.path.join(self.func_cache_path, '*')): + for fpath in glob.glob(os.path.join(self.func_cache_path, "*")): os.remove(fpath) return Decorator diff --git a/scripts/build_feature_store.py b/scripts/build_feature_store.py index 2e9c91e..bd4d0bd 100644 --- a/scripts/build_feature_store.py +++ b/scripts/build_feature_store.py @@ -131,6 +131,7 @@ # COMMAND ---------- + # DBTITLE 1,Helper functions @udf(returnType=DoubleType()) def extract_percentage(value: str) -> float: @@ -429,6 +430,7 @@ def get_water_heater_capacity_ashrae( ] ) + # pulled from options.tsv @udf(wh_schema) def get_water_heater_specs(name: str) -> StructType: @@ -537,8 +539,10 @@ def add_water_heater_features(df): .drop("wh_struct") ) + # COMMAND ---------- + # DBTITLE 1,Mapping Expressions # Make various mapping expressions def make_map_type_from_dict(mapping: Dict) -> Column: @@ -577,6 +581,7 @@ def make_map_type_from_dict(mapping: Dict) -> Column: # COMMAND ---------- + # DBTITLE 1,Building metadata feature transformation function def transform_building_features() -> DataFrame: """ @@ -933,6 +938,7 @@ def transform_building_features() -> DataFrame: ) return building_metadata_transformed + # COMMAND ---------- # DBTITLE 1,Transform building metadata @@ -1164,6 +1170,7 @@ def apply_upgrades(baseline_building_features: DataFrame, upgrade_id: int) -> Da return upgrade_building_features + # COMMAND ---------- # DBTITLE 1,Apply upgrade logic to baseline features @@ -1277,6 +1284,7 @@ def apply_upgrades(baseline_building_features: DataFrame, upgrade_id: int) -> Da # COMMAND ---------- + # DBTITLE 1,Weather feature transformation function def transform_weather_features() -> DataFrame: """ @@ -1289,17 +1297,16 @@ def transform_weather_features() -> DataFrame: weather_df = spark.read.table("ml.surrogate_model.weather_data_hourly") weather_pkeys = ["weather_file_city"] - weather_data_arrays = ( - weather_df.groupBy(weather_pkeys).agg( + weather_data_arrays = weather_df.groupBy(weather_pkeys).agg( *[ F.collect_list(c).alias(c) for c in weather_df.columns if c not in weather_pkeys + ["datetime_formatted"] ] ) - ) return weather_data_arrays + # COMMAND ---------- # DBTITLE 1,Transform weather features @@ -1310,22 +1317,23 @@ def transform_weather_features() -> DataFrame: # DBTITLE 1,Create and apply string indexer to generate weather file city index # Create the StringIndexer indexer = StringIndexer( - inputCol="weather_file_city", - outputCol="weather_file_city_index", - 
stringOrderType="alphabetAsc" + inputCol="weather_file_city", + outputCol="weather_file_city_index", + stringOrderType="alphabetAsc", ) weather_file_city_indexer = indexer.fit(weather_data_transformed) -weather_data_indexed = ( - weather_file_city_indexer.transform(weather_data_transformed) - .withColumn('weather_file_city_index', F.col('weather_file_city_index').cast('int')) -) +weather_data_indexed = weather_file_city_indexer.transform( + weather_data_transformed +).withColumn("weather_file_city_index", F.col("weather_file_city_index").cast("int")) building_metadata_applicable_upgrades_with_weather_file_city_index = ( - weather_file_city_indexer.transform(building_metadata_applicable_upgrades) - .withColumn('weather_file_city_index', F.col('weather_file_city_index').cast('int')) - + weather_file_city_indexer.transform( + building_metadata_applicable_upgrades + ).withColumn( + "weather_file_city_index", F.col("weather_file_city_index").cast("int") + ) ) # COMMAND ---------- @@ -1409,5 +1417,3 @@ def transform_weather_features() -> DataFrame: ) # COMMAND ---------- - - diff --git a/scripts/extract_data.py b/scripts/extract_data.py index af6be44..22913ac 100644 --- a/scripts/extract_data.py +++ b/scripts/extract_data.py @@ -1,8 +1,8 @@ # Databricks notebook source -# MAGIC %md # Extract Raw Dataset for Surrogate Model +# MAGIC %md # Extract Raw Dataset for Surrogate Model # MAGIC # MAGIC ### Goal -# MAGIC Extract and collect the raw ResStock EUSS data required for surrogate modeling, do some light pre-processing to prep for feature engineering, and write to a Delta Table. +# MAGIC Extract and collect the raw ResStock EUSS data required for surrogate modeling, do some light pre-processing to prep for feature engineering, and write to a Delta Table. # MAGIC # MAGIC ### Process # MAGIC * Extract and lightly preprocess various ResStock data @@ -13,14 +13,14 @@ # MAGIC # MAGIC ### I/Os # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC Let `RESSTOCK_PATH = gs://the-cube/data/raw/nrel/end_use_load_profiles/2022/` # MAGIC - `RESSTOCK_PATH/metadata_and_annual_results/national/parquet/baseline_metadata_only.parquet` : Parquet file of building metadata (building id [550K] x building metadata variable) # MAGIC - `RESSTOCK_PATH/metadata_and_annual_results/national/parquet/*_metadata_and_annual_results.parquet`: Parquet file of annual building model simulation outputs (building id [~550K], upgrade_id [11] x output variable) -# MAGIC - `RESSTOCK_PATH/weather/state=*/*_TMY3.csv`: 3107 weather csvs for each county (hour [8760] x weather variable). -# MAGIC Note that counties corresponding to the same weather station have identical data. +# MAGIC - `RESSTOCK_PATH/weather/state=*/*_TMY3.csv`: 3107 weather csvs for each county (hour [8760] x weather variable). +# MAGIC Note that counties corresponding to the same weather station have identical data. 
# MAGIC -# MAGIC ##### Outputs: +# MAGIC ##### Outputs: # MAGIC - `ml.surrogate_model.building_metadata`: Building metadata indexed by (building_id) # MAGIC - `ml.surrogate_model.building_simulation_outputs_annual`: Annual building model simulation outputs indexed by (building_id, upgrade_id) # MAGIC - `ml.surrogate_model.weather_data_hourly`: Hourly weather data indexed by (weather_file_city, hour datetime) @@ -70,6 +70,7 @@ # COMMAND ---------- + # DBTITLE 1,Functions for loading and preprocessing raw data def transform_pkeys(df): return ( @@ -240,6 +241,7 @@ def extract_hourly_weather_data(): ) return weather_data + # COMMAND ---------- # DBTITLE 1,Extract building metadata diff --git a/scripts/gpu_usage.py b/scripts/gpu_usage.py index 1837580..395f31f 100644 --- a/scripts/gpu_usage.py +++ b/scripts/gpu_usage.py @@ -58,7 +58,7 @@ df_usage = pd.concat([df_usage, pd.DataFrame([row])]) data_cols = [col for col in df_usage.columns if col != "timestamp"] - labels = [label_map[col] for col in data_cols] + labels = [label_map[col] for col in data_cols] if lines is None: lines = ax.plot( @@ -93,7 +93,7 @@ fig, ax = plt.subplots(figsize=(6, 4)) ax.set_title(", ".join(labels) + " Utilization") - + lines = ax.plot( df_usage["timestamp"], df_usage[data_cols], @@ -108,5 +108,3 @@ ax.grid() # COMMAND ---------- - - diff --git a/scripts/model_evaluation.py b/scripts/model_evaluation.py index e37ec50..7fe81a3 100644 --- a/scripts/model_evaluation.py +++ b/scripts/model_evaluation.py @@ -251,8 +251,8 @@ "water_heater_fuel", "clothes_dryer_fuel", "cooking_range_fuel", - # "is_mobile_home", - # "is_attached", + # "is_mobile_home", + # "is_attached", # "unit_level_in_building" ] pred_by_building_upgrade_fuel_model_with_metadata = test_set.select( @@ -269,8 +269,9 @@ .otherwise(F.col("heating_fuel")), ).withColumn( "ac_type", - F.when(F.col("ac_type") == "None", F.lit("No Cooling")) - .otherwise(F.col("ac_type")), + F.when(F.col("ac_type") == "None", F.lit("No Cooling")).otherwise( + F.col("ac_type") + ), ) ) @@ -280,6 +281,7 @@ # COMMAND ---------- + # DBTITLE 1,Calculate error metrics # define function to calculate absolute prediction error @udf("double") @@ -357,6 +359,7 @@ def APE(abs_error: float, actual: float, eps=1e-3): # COMMAND ---------- + # DBTITLE 1,Define function for aggregating over metrics # define function to calculate absolute prediction error def wMAPE(abs_error_col: Column, actual_col: Column) -> Column: @@ -410,6 +413,7 @@ def aggregate_metrics(pred_df_savings: DataFrame, groupby_cols: List[str]): return pred_df_savings.groupby(*groupby_cols).agg(*aggregation_expression) + # COMMAND ---------- # DBTITLE 1,Calculate aggregated metrics with various groupings @@ -607,7 +611,6 @@ def aggregate_metrics(pred_df_savings: DataFrame, groupby_cols: List[str]): pred_df_savings_pd_clip = pred_df_savings_pd.copy() with sns.axes_style("whitegrid"): - g = sns.catplot( data=pred_df_savings_pd_clip, x="Baseline Fuel", @@ -679,6 +682,7 @@ def save_figure_to_gcfs(fig, gcspath, figure_format="png", dpi=200, transparent= blob = bucket.blob(gcspath.blob) blob.upload_from_file(buf, content_type=figure_format, rewind=True) + # COMMAND ---------- # DBTITLE 1,Write out figure diff --git a/scripts/model_training.py b/scripts/model_training.py index c86ddb0..47940e2 100644 --- a/scripts/model_training.py +++ b/scripts/model_training.py @@ -12,13 +12,13 @@ # MAGIC # MAGIC ### I/Os # MAGIC -# MAGIC ##### Inputs: +# MAGIC ##### Inputs: # MAGIC - `ml.surrogate_model.building_metadata`: Building metadata features 
indexed by (building_id) # MAGIC - `ml.surrogate_model.weather_data_hourly`: Weather data indexed by (weather_file_city) with a 8760-length timeseries vector # MAGIC - `ml.surrogate_model.building_upgrade_simulation_outputs_annual`: Annual building model simulation outputs indexed by (building_id, upgrade_id) # MAGIC -# MAGIC ##### Outputs: -# MAGIC None. The model is logged to the unity catalog with the run id, but as of now is not registered due to issue with signature enforcement slowing down inference. +# MAGIC ##### Outputs: +# MAGIC None. The model is logged to the unity catalog with the run id, but as of now is not registered due to issue with signature enforcement slowing down inference. # MAGIC # MAGIC ### TODOs: # MAGIC @@ -34,7 +34,7 @@ # MAGIC #### Cluster/ User Requirements # MAGIC - Access Mode: Single User or Shared (Not No Isolation Shared) # MAGIC - Runtime: >= Databricks Runtime 14.3 ML (or >= Databricks Runtime 14.3 + `%pip install databricks-feature-engineering`) -# MAGIC - Node type: Single Node. Because of [this issue](https://kb.databricks.com/en_US/libraries/apache-spark-jobs-fail-with-environment-directory-not-found-error), worker nodes cannot access the directory needed to run inference on a keras trained model, meaning that the `score_batch()` function throws and OSError. +# MAGIC - Node type: Single Node. Because of [this issue](https://kb.databricks.com/en_US/libraries/apache-spark-jobs-fail-with-environment-directory-not-found-error), worker nodes cannot access the directory needed to run inference on a keras trained model, meaning that the `score_batch()` function throws and OSError. # MAGIC - Can be run on CPU or GPU, with 2x speedup on GPU # MAGIC - Cluster-level packages: `gcsfs==2023.5.0`, `mlflow==2.13.0` (newer than default, which is required to pass a `code_paths` in logging) # MAGIC - `USE CATALOG`, `CREATE SCHEMA` privleges on the `ml` Unity Catalog (Ask Miki if for access) @@ -116,6 +116,7 @@ # COMMAND ---------- + # DBTITLE 1,Define wrapper class for processing at inference time class SurrogateModelingWrapper(mlflow.pyfunc.PythonModel): """ @@ -163,7 +164,7 @@ def postprocess_result( Parameters: - results (dict of {str: np.ndarray}): The outputs of the model in format {target_name (str) : np.ndarray [N,]} - - feature_df (pd.DataFrame): The features for the samples of shape [N, *]. Only the features flagging which fuels are present are used here. + - feature_df (pd.DataFrame): The features for the samples of shape [N, *]. Only the features flagging which fuels are present are used here. Returns: - np.ndarray of shape [N, M] @@ -211,7 +212,11 @@ def convert_feature_dataframe_to_dict( - The preprocessed feature data in format {feature_name (str) : np.array of shape [N] for building model features and shape [N,8760] for weather features} """ - return {col: np.array(feature_df[col]) for col in self.building_features + ['weather_file_city_index']} + return { + col: np.array(feature_df[col]) + for col in self.building_features + ["weather_file_city_index"] + } + # COMMAND ---------- @@ -247,7 +252,6 @@ def convert_feature_dataframe_to_dict( # Starts an MLflow experiment to track training parameters and results. 
with mlflow.start_run() as run: - # Get the unique ID of the current run in case we aren't registering it run_id = mlflow.active_run().info.run_id @@ -317,5 +321,3 @@ def convert_feature_dataframe_to_dict( dbutils.jobs.taskValues.set(key="run_id", value=run_id) # COMMAND ---------- - - diff --git a/src/datagen.py b/src/datagen.py index 76864a7..c0f5386 100644 --- a/src/datagen.py +++ b/src/datagen.py @@ -1,7 +1,6 @@ import math from typing import Any, Dict, List, Tuple -import mlflow import numpy as np import pandas as pd import pyspark.sql.functions as F @@ -21,7 +20,8 @@ class DataGenerator(tf.keras.utils.Sequence): Let N and M be the number of samples and targets in the training set respectively. Let P_b and P_w be the number of building and weather features respectively, where P = P_b + P_w is the total number of features. - Attributes: + Attributes + ---------- - building_features (List[str]): names of the building features to use in training. Defaults to class attribute. - weather_features (List[str]): names of the weather features to use in training. Defaults to class attribute. - upgrade_ids (List[str]): ids of upgrades to include in training set. Defaults to class attribute. @@ -35,7 +35,7 @@ class DataGenerator(tf.keras.utils.Sequence): - training_set (TrainingSet): Databricks TrainingSet object contaning targets, building feautres and weather features. - training_df (pd.DataFrame): Dataframe of building features and targets of shape [N, P_b + M]. Does not include weather features. - weather_features_df (pd.DataFrame): Dataframe of building features of shape [N, P_w] where each column contains a 8760-length vector. - - weather_features_matrix (numpy.ndarray): A 3D matrix of shape (number of weather file cities, number of weather features, and number of hours in a year) representing weather data for various cities over the course of a year. + - weather_features_matrix (numpy.ndarray): A 3D matrix of shape (number of weather file cities, number of weather features, and number of hours in a year) representing weather data for various cities over the course of a year. - building_feature_vocab_dict (dict): Dict of format {feature_name : {"dtype": feature_dtype, "vocab": np.array of all possible features if string feature else empty}}. - fe (databricks.feature_engineering.client.FeatureEngineeringClient: client for interacting with the @@ -167,11 +167,11 @@ def __init__( """ Initializes the DataGenerator object. - Parameters: + Parameters + ---------- - train_data (DataFrame): the training data containing the targets and keys to join to the feature tables. See class docstring for all other parameters. """ - # self.upgrades = upgrade_ids or self.upgrade_ids self.building_features = building_features or self.building_features self.weather_features = weather_features or self.weather_features @@ -198,8 +198,12 @@ def __init__( self.weather_features_df = self.init_weather_features() self.weather_features_matrix = np.stack( - self.weather_features_df.sort_values(by = 'weather_file_city_index')[self.weather_features].apply(lambda row: np.stack(row), axis=1).values - ) + self.weather_features_df.sort_values(by="weather_file_city_index")[ + self.weather_features + ] + .apply(lambda row: np.stack(row), axis=1) + .values + ) self.building_feature_vocab_dict = self.init_building_feature_vocab_dict() self.on_epoch_end() @@ -208,13 +212,14 @@ def get_building_feature_lookups(self) -> FeatureLookup: """ Returns the FeatureLookup objects for building features. 
- Returns: + Returns + ------- - list: List of FeatureLookup objects for building features. """ return [ FeatureLookup( table_name=self.building_feature_table_name, - feature_names=self.building_features + ['weather_file_city_index'], + feature_names=self.building_features + ["weather_file_city_index"], lookup_key=["building_id", "upgrade_id", "weather_file_city"], ), ] @@ -223,7 +228,8 @@ def get_weather_feature_lookups(self) -> FeatureLookup: """ Returns the FeatureLookup objects for weather features. - Returns: + Returns + ------- - list: List of FeatureLookup objects for weather features. """ return [ @@ -242,12 +248,14 @@ def init_training_set( """ Initializes the Databricks TrainingSet object contaning targets, building feautres and weather features. - Parameters: + Parameters + ---------- - train_data (DataFrame): the training data containing the targets and keys to join to the feature tables. - exclude_columns (list of str): columns to be excluded from the output training set. Defaults to the join keys: ["building_id", "upgrade_id", "weather_file_city"]. - Returns: + Returns + ------- - TrainingSet """ # Join the feature tables @@ -265,10 +273,12 @@ def init_building_features_and_targets(self, train_data: DataFrame) -> pd.DataFr Loads dataframe containing building features and targets into memory. Note that weather features are not joined until generation time when __get_item__() is called. - Parameters: + Parameters + ---------- - train_data (DataFrame): the training data containing the targets and keys to join to the feature tables. - Returns: + Returns + ------- - pd.DataFrame: dataframe containing building features and targets. """ # Join to building feature tables and drop join keys since these aren't features we wanna train on @@ -284,7 +294,8 @@ def init_weather_features(self) -> pd.DataFrame: """ Loads dataframe weather features into memory - Returns: + Returns + ------- - pd.DataFrame: The weather features dataframe. """ weather_features_table = self.fe.read_table( @@ -300,10 +311,12 @@ def feature_dtype(self, feature_name: str) -> Any: Returns the dtype of the feature, which is tf.string if object, otherwise self.dtype - Parameters: + Parameters + ---------- - feature_name (str): the name of the feature. - Returns: + Returns + ------- - The dtype of the feature, which is tf.string if catagorical """ is_string_feature = self.training_df[feature_name].dtype == "O" @@ -314,10 +327,12 @@ def feature_vocab(self, feature_name: str) -> np.ndarray: Returns the vocabulary of the feature: unique list of possible values a categorical feature can take on (only used for categorical). - Parameters: + Parameters + ---------- - feature_name: str, the name of the feature. - Returns: + Returns + ------- - np.ndarray: The unique list of possible values a categorical feature can take on """ return self.training_df[feature_name].unique() @@ -326,7 +341,8 @@ def init_building_feature_vocab_dict(self) -> Dict[str, Dict[str, Any]]: """ Initializes the building feature vocabulary dictionary. - Returns: + Returns + ------- Dict of format {feature_name : {"dtype": feature_dtype, "vocab": np.array of all possible features if string feature else empty}}. """ bm_dict = {} @@ -344,37 +360,45 @@ def convert_dataframe_to_dict( """ Converts the training features from a pandas dataframe to a dictionary. - Parameters: + Parameters + ---------- - feature_df: pd.DataFrame, the input features for the model of shape [N, P + 1] where feature columns for weather features contain len 8760 arrays. 
Note the one extra column "in_weather_city" which was used in join and will get dropped here. - Returns: + Returns + ------- Dict[str,np.ndarray]: The preprocessed feature data in format {feature_name (str): np.array of shape [len(feature_df)] for building model features and shape [len(feature_df), 8760] for weather features} """ - return {col: np.array(feature_df[col]) for col in self.building_features + ['weather_file_city_index']} + return { + col: np.array(feature_df[col]) + for col in self.building_features + ["weather_file_city_index"] + } def __len__(self) -> int: """ Returns the number of batches. - Returns: + Returns + ------- - int: The number of batches. """ return math.ceil(len(self.training_df) / self.batch_size) - + def __getitem__( self, index: int ) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]: """ Generates one batch of data. - Parameters: + Parameters + ---------- - index: int, the batch index. - Returns: + Returns + ------- - X (dict): features for batch in format {feature_name (str): np.array of shape [batch_size] for building model features and shape [batch_size, 8760] for weather features} - y (dict) : targets for the batch in format {target_name (str): np.array of shape [batch_size]} @@ -410,20 +434,22 @@ def load_data( and split into train/val/test sets. The parameters n_train and n_test can be used to reduce the size of the data, by subsetting from the existing train/val/test data, meaning that the same splits are preserved. - Parameters: + Parameters + ---------- consumption_group_dict (dict): Dictionary mapping consumption categories (e.g., 'heating') to columns. Default is DataGenerator.consumption_by_fuel_dict (too long to write out) building_feature_table_name (str): Name of the building feature table. Default is "ml.surrogate_model.building_features" p_val (float): Proportion of data to use for validation. Default is 0.2. p_test (float): Proportion of data to use for testing. Default is 0.1. - n_train (int): Number of training records to select, where the size of the val and tests sets will be adjusted accordingly to + n_train (int): Number of training records to select, where the size of the val and tests sets will be adjusted accordingly to maintain the requested ratios. If number is passed that exceeds the size of p_train * all samples, then this will just be set to that max value. Default is None (select all) - n_test (int): Number of test records to select, where the size of the train and val sets will be adjusted accordingly to maintain + n_test (int): Number of test records to select, where the size of the train and val sets will be adjusted accordingly to maintain the requested ratios. If number is passed that exceeds the size of p_test * all_samples, then this will just be set to that max value. Default is None (select all). seed (int): Seed for random sampling. Default is 42. 
- Returns: + Returns + ------- train data (DataFrame) val_data (DataFrame) test_data (DataFrame) @@ -465,9 +491,11 @@ def load_data( ) if n_train: - frac = np.clip(n_train * p_baseline / train_ids.count(), a_max = 1.0, a_min=0.0) + frac = np.clip( + n_train * p_baseline / train_ids.count(), a_max=1.0, a_min=0.0 + ) elif n_test: - frac = np.clip(n_test * p_baseline / test_ids.count(), a_max = 1.0, a_min=0.0) + frac = np.clip(n_test * p_baseline / test_ids.count(), a_max=1.0, a_min=0.0) else: frac = 1.0 @@ -476,4 +504,4 @@ def load_data( val_df = val_ids.sample(fraction=frac, seed=0).join(data, on="building_id") test_df = test_ids.sample(fraction=frac, seed=0).join(data, on="building_id") - return train_df, val_df, test_df \ No newline at end of file + return train_df, val_df, test_df diff --git a/src/gpuutils.py b/src/gpuutils.py index d6aeb9b..d87a985 100644 --- a/src/gpuutils.py +++ b/src/gpuutils.py @@ -23,7 +23,6 @@ def gather_cpu_gpu_metrics( log_interval: Optional[int] = None, ): """Gather GPU performance at regular intervals.""" - if not isinstance(interval, pd.Timedelta): interval = pd.Timedelta(interval, unit="seconds") @@ -66,7 +65,7 @@ def gather_cpu_gpu_metrics( gpu_utilization = utilization.gpu gpu_memory_utilization = utilization.memory - + utilization_dict.update( { "gpu_utilization": gpu_utilization, diff --git a/src/surrogate_model.py b/src/surrogate_model.py index c530811..15f3270 100644 --- a/src/surrogate_model.py +++ b/src/surrogate_model.py @@ -17,7 +17,8 @@ class SurrogateModel: """ A Deep Learning model for surrogate modeling energy consumption prediction. - Attributes: + Attributes + ---------- - name (str): the name of the model. - batch_size (int): the batch size for training. Defaults to 64. - dtype (np.dtype), the data type for the numeric features in the model. Defaults to np.float32. @@ -62,23 +63,34 @@ def create_model( """ Create a keras model based on the given data generator and layer parameters. - Parameters: + Parameters + ---------- - train_gen (DataGenerator):, the data generator object for training. - layer_params (Dict[str, Any]): the layer parameters for the model. 
- Returns: + Returns + ------- - tensorflow.keras.src.engine.functional.Functional: the created keras model """ - #Dense-BatchNorm-LeakyReLU block - def dense_batchnorm_leakyrelu(x:tf.keras.layers, n_units:int, name:str, **layer_params): + + # Dense-BatchNorm-LeakyReLU block + def dense_batchnorm_leakyrelu( + x: tf.keras.layers, n_units: int, name: str, **layer_params + ): x = layers.Dense(n_units, name=f"{name}_dense", **layer_params)(x) x = layers.BatchNormalization(name=f"{name}_batchnorm")(x) x = layers.LeakyReLU(name=f"{name}_leakyrelu")(x) return x - - #Conv-BatchNorm-LeakyReLU block - def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:str, **layer_params): + + # Conv-BatchNorm-LeakyReLU block + def conv_batchnorm_relu( + x: tf.keras.layers, + filters: int, + kernel_size: int, + name: str, + **layer_params, + ): x = layers.Conv1D( filters=filters, kernel_size=kernel_size, @@ -90,7 +102,7 @@ def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:st x = layers.BatchNormalization(name=f"{name}_conv_batchnorm")(x) x = layers.LeakyReLU(name=f"{name}_conv_leakyrelu")(x) return x - + # Building metadata model bmo_inputs_dict = { building_feature: layers.Input( @@ -110,7 +122,9 @@ def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:st output_mode="one_hot", dtype=layer_params["dtype"], ) - vocab_tensor = tf.convert_to_tensor(train_gen.building_feature_vocab_dict[feature]["vocab"]) + vocab_tensor = tf.convert_to_tensor( + train_gen.building_feature_vocab_dict[feature]["vocab"] + ) encoder.adapt(vocab_tensor) layer = encoder(layer) bmo_inputs.append(layer) @@ -120,10 +134,10 @@ def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:st ) bm = layers.BatchNormalization(name="init_batchnorm")(bm) - bm = dense_batchnorm_leakyrelu(bm, n_units = 128, name = "first") - bm = dense_batchnorm_leakyrelu(bm, n_units = 64, name = "second") - bm = dense_batchnorm_leakyrelu(bm, n_units = 32, name = "third") - bm = dense_batchnorm_leakyrelu(bm, n_units = 16, name = "fourth") + bm = dense_batchnorm_leakyrelu(bm, n_units=128, name="first") + bm = dense_batchnorm_leakyrelu(bm, n_units=64, name="second") + bm = dense_batchnorm_leakyrelu(bm, n_units=32, name="third") + bm = dense_batchnorm_leakyrelu(bm, n_units=16, name="fourth") bmo = models.Model( inputs=bmo_inputs_dict, outputs=bm, name="building_features_model" @@ -135,25 +149,33 @@ def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:st num_cities, num_features, num_hours = train_gen.weather_features_matrix.shape # Input for the weather_file_city_index (lookup key) - weather_file_city_index_input = layers.Input(shape=(1,), dtype='int32', name='weather_file_city_index') + weather_file_city_index_input = layers.Input( + shape=(1,), dtype="int32", name="weather_file_city_index" + ) # Create weather embedding layer weather_embedding_layer = layers.Embedding( input_dim=num_cities, output_dim=num_hours * num_features, - weights=[train_gen.weather_features_matrix.reshape(num_cities, num_hours * num_features)], - trainable=False, name='weather_embedding')(weather_file_city_index_input) - + weights=[ + train_gen.weather_features_matrix.reshape( + num_cities, num_hours * num_features + ) + ], + trainable=False, + name="weather_embedding", + )(weather_file_city_index_input) + # Reshape weather embedding layer wm = layers.Reshape((num_features, num_hours))(weather_embedding_layer) # Apply transpose using a Lambda layer - wm = layers.Lambda(lambda x: 
tf.transpose(x, perm=[0,2,1]))(wm) + wm = layers.Lambda(lambda x: tf.transpose(x, perm=[0, 2, 1]))(wm) # Proceed with batch normalization and convolutions wm = layers.BatchNormalization(name="init_conv_batchnorm")(wm) - wm = conv_batchnorm_relu(wm, filters=16, kernel_size=8, name = "first") - wm = conv_batchnorm_relu(wm, filters=8, kernel_size=8, name = "second") + wm = conv_batchnorm_relu(wm, filters=16, kernel_size=8, name="first") + wm = conv_batchnorm_relu(wm, filters=8, kernel_size=8, name="second") # sum the time dimension wm = layers.Lambda( @@ -163,25 +185,47 @@ def conv_batchnorm_relu(x:tf.keras.layers, filters:int, kernel_size:int, name:st )(wm) wmo = models.Model( - inputs=weather_file_city_index_input, outputs=wm, name="weather_features_model" + inputs=weather_file_city_index_input, + outputs=wm, + name="weather_features_model", ) # Combined model and separate towers for output groups cm = layers.Concatenate(name="combine")([bmo.output, wmo.output]) - cm = layers.Dense(24, name="combine_first_dense", activation="leaky_relu", **layer_params)(cm) - cm = layers.Dense(24, name="combine_second_dense", activation="leaky_relu", **layer_params)(cm) - cm = layers.Dense(16, name="third_second_dense", activation="leaky_relu", **layer_params)(cm) + cm = layers.Dense( + 24, name="combine_first_dense", activation="leaky_relu", **layer_params + )(cm) + cm = layers.Dense( + 24, name="combine_second_dense", activation="leaky_relu", **layer_params + )(cm) + cm = layers.Dense( + 16, name="third_second_dense", activation="leaky_relu", **layer_params + )(cm) # building a separate tower for each output group final_outputs = {} for consumption_group in train_gen.targets: - io = layers.Dense(4, name=consumption_group + "_entry", activation="leaky_relu", **layer_params)(cm) - io = layers.Dense(2, name=consumption_group + "_mid", activation="leaky_relu", **layer_params)(io) + io = layers.Dense( + 4, + name=consumption_group + "_entry", + activation="leaky_relu", + **layer_params, + )(cm) + io = layers.Dense( + 2, + name=consumption_group + "_mid", + activation="leaky_relu", + **layer_params, + )(io) io = layers.Dense(1, name=consumption_group, activation="leaky_relu")(io) final_outputs[consumption_group] = io final_model = models.Model( - inputs={**bmo.input, 'weather_file_city_index': weather_file_city_index_input}, outputs=final_outputs + inputs={ + **bmo.input, + "weather_file_city_index": weather_file_city_index_input, + }, + outputs=final_outputs, ) final_model.compile( @@ -195,7 +239,8 @@ def get_latest_model_version(self) -> int: """ Returns the latest version of the registered model. - Returns: + Returns + ------- - int, the latest version of the registered model """ @@ -213,10 +258,12 @@ def get_latest_registered_model_uri(self, verbose: bool = True) -> str: """ Returns the URI for the latest version of the registered model. - Raises: + Raises + ------ - ValueError: If no version of the model has been registered yet - Returns: + Returns + ------- - str: the URI for the latest version of the registered model """ @@ -239,14 +286,17 @@ def get_model_uri( * the model version if specified * the latest registered model otherwise - Raises: + Raises + ------ - ValueError: If no run_id is not passed and no version of the model has been registered yet - Parameters: + Parameters + ---------- - run_id (str): the ID of the run. Defaults to None. - version (int): the version of the model. Ignored if run_id is passed. Defaults to None. 
- Returns: + Returns + ------- - str, the URI for the specified model version or the latest registered model """ @@ -269,12 +319,14 @@ def score_batch( * the latest registered model otherwise Returns the input dataframe with a column containing predicted values as an array (one for each target) - Parameters: + Parameters + ---------- - test_data (DataFrame): the test data to run inference on containing the keys to join to feature tables on. - run_id (str): the ID of the run. Defaults to None. - version (int): the version of the model. Ignored if run_id is passed. Defaults to None. - Returns: + Returns + ------- - DataFrame: test data with predictions """ @@ -291,11 +343,13 @@ def mape(y_true, y_pred): Computes the Mean Absolute Percentage Error between the true and predicted values, ignoring elements where the true value is 0. - Parameters: + Parameters + ---------- - y_true (array): the true values - y_pred (array): the predicted values - Returns: + Returns + ------- - float: the Mean Absolute Percentage Error """ @@ -304,18 +358,19 @@ def mape(y_true, y_pred): @keras.saving.register_keras_serializable(package="my_package", name="masked_mae") -def masked_mae(y_true:tf.Tensor, y_pred:tf.Tensor) -> tf.Tensor: +def masked_mae(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: """ Calculate the Mean Absolute Error (MAE) between true and predicted values, ignoring those where y_true=0. - This custom loss function is designed for scenarios where zero values in the true values are considered to be irrelevant and should not contribute to the loss calculation. It applies a mask to both the true and predicted values to exclude these zero entries before computing the MAE. The decorator allows this function to be serialized and logged alongside the keras model. + This custom loss function is designed for scenarios where zero values in the true values are considered to be irrelevant and should not contribute to the loss calculation. It applies a mask to both the true and predicted values to exclude these zero entries before computing the MAE. The decorator allows this function to be serialized and logged alongside the keras model. Args: - y_true (tf.Tensor): The true values. - y_pred (tf.Tensor): The predicted values. - Returns: - - tf.Tensor: The mean absolute error computed over non-zero true values. This is just a single scalar stored in a tensor. + Returns + ------- + - tf.Tensor: The mean absolute error computed over non-zero true values. This is just a single scalar stored in a tensor. """ # Create a mask where targets are not zero mask = tf.not_equal(y_true, 0) @@ -330,4 +385,4 @@ def masked_mae(y_true:tf.Tensor, y_pred:tf.Tensor) -> tf.Tensor: return tf.constant(0.0) else: # Calculate the mean absolute error on the masked data - return tf.reduce_mean(tf.abs(y_true_masked - y_pred_masked)) \ No newline at end of file + return tf.reduce_mean(tf.abs(y_true_masked - y_pred_masked))
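
A minimal usage sketch of the custom loss defined above (illustrative only; `final_model`, `train_gen`, and `val_gen` are assumed to be the objects produced by `SurrogateModel.create_model()` and `DataGenerator`, and are not part of this patch):

# Sketch: compile with the masked MAE loss, train, save, and reload.
# Assumes the keras import used in src/surrogate_model.py (e.g. `from tensorflow import keras`).
final_model.compile(optimizer="adam", loss=masked_mae)
final_model.fit(train_gen, validation_data=val_gen, epochs=5)
final_model.save("surrogate_model.keras")

# Because masked_mae is registered via keras.saving.register_keras_serializable,
# load_model can resolve it without an explicit custom_objects mapping;
# safe_mode=False may be needed since the architecture contains Lambda layers.
reloaded = keras.models.load_model("surrogate_model.keras", safe_mode=False)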