From 96cdf3deabaa8e961dc8fa2ff7fcabeb6b9126a0 Mon Sep 17 00:00:00 2001 From: Patrick Leary Date: Tue, 6 Feb 2024 23:44:44 -0500 Subject: [PATCH 1/4] consolidating class usage; fix formatting issues --- .flake8 | 2 +- Pipfile | 3 + forms.py | 7 +- generate_thresholds.py | 338 +++++--------------- lib/inat_inferrer.py | 65 ++-- lib/inat_vision_api.py | 44 ++- lib/model_taxonomy.py | 73 ----- lib/model_taxonomy_dataframe.py | 38 ++- lib/model_test_data_export_manager.py | 6 +- lib/model_test_data_exporter.py | 7 +- lib/pt_geo_prior_model.py | 2 +- lib/taxon.py | 30 -- lib/tf_gp_elev_model.py | 6 +- lib/vision_inferrer.py | 4 +- lib/vision_testing.py | 43 ++- requirements.txt | 2 + taxon_range_evaluation.py | 431 ++++++-------------------- tests/test_inat_inferrer.py | 2 +- tests/test_model_taxonomy.py | 54 ---- tests/test_taxon.py | 17 - tests/test_tf_gp_elev_model.py | 2 +- 21 files changed, 328 insertions(+), 848 deletions(-) create mode 100644 Pipfile delete mode 100644 lib/model_taxonomy.py delete mode 100644 lib/taxon.py delete mode 100644 tests/test_model_taxonomy.py delete mode 100644 tests/test_taxon.py diff --git a/.flake8 b/.flake8 index 998a753..8cc6d52 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,7 @@ [flake8] ignore = D203 max-line-length = 100 +inline-quotes = " exclude = .git, __pycache__, @@ -11,4 +12,3 @@ exclude = test-obs*, venv max-complexity = 10 - diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..6c49b0b --- /dev/null +++ b/Pipfile @@ -0,0 +1,3 @@ +[scripts] +tests = "pytest -s" +coverage = "bash -c 'coverage run -m pytest -s && coverage report --show-missing'" diff --git a/forms.py b/forms.py index 420dd8c..26f0371 100644 --- a/forms.py +++ b/forms.py @@ -1,8 +1,11 @@ from flask_wtf import FlaskForm from flask_wtf.file import FileField, FileRequired + class ImageForm(FlaskForm): - image = FileField('image', + image = FileField( + "image", validators=[ FileRequired(message="Please include 'image' field.") - ]) + ] + ) diff --git a/generate_thresholds.py b/generate_thresholds.py index 3f0bd14..47ff87a 100644 --- a/generate_thresholds.py +++ b/generate_thresholds.py @@ -4,218 +4,29 @@ import argparse import tifffile -import os import pandas as pd import numpy as np import h3 -import h3pandas +import h3pandas # noqa: F401 import tensorflow as tf -import csv -import math -import json from tqdm.auto import tqdm -import tensorflow as tf from sklearn.metrics import precision_recall_curve -import matplotlib.pyplot as plt import warnings +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe +from lib.tf_gp_elev_model import TFGeoPriorModelElev -class ResLayer(tf.keras.layers.Layer): - def __init__(self): - super(ResLayer, self).__init__() - self.w1 = tf.keras.layers.Dense( - 256, activation="relu", kernel_initializer="he_normal" - ) - self.w2 = tf.keras.layers.Dense( - 256, activation="relu", kernel_initializer="he_normal" - ) - self.dropout = tf.keras.layers.Dropout(rate=0.5) - self.add = tf.keras.layers.Add() - - def call(self, inputs): - x = self.w1(inputs) - x = self.dropout(x) - x = self.w2(x) - x = self.add([x, inputs]) - return x - - def get_config(self): - return {} - -class Taxon: - - def __init__(self, row): - for key in row: - setattr(self, key, row[key]) - - def set(self, attr, val): - setattr(self, attr, val) - - def is_or_descendant_of(self, taxon): - if self.id == taxon.id: - return True - return self.descendant_of(taxon) - - # using the nested set left and right values, a taxon is a descendant of another - # as long as its left is 
higher and its right is lower - def descendant_of(self, taxon): - return self.left > taxon.left and self.right < taxon.right - -class ModelTaxonomy: - - def __init__(self, path): - self.load_mapping(path) - self.assign_nested_values() - - def load_mapping(self, path): - self.node_key_to_leaf_class_id = {} - self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0, - # so create a fake taxon of Life to represent the root of the entire tree - self.taxa = {0: Taxon({"name": "Life", "depth": 0})} - self.taxon_children = {} - try: - with open(path) as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter=",") - for row in csv_reader: - taxon_id = int(row["taxon_id"]) - rank_level = float(row["rank_level"]) - leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None - parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0 - # some taxa are not leaves and aren't represented in the leaf layer - if leaf_class_id is not None: - self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id - self.leaf_class_to_taxon[leaf_class_id] = taxon_id - self.taxa[taxon_id] = Taxon({ - "id": taxon_id, - "name": row["name"], - "parent_id": parent_id, - "leaf_class_id": leaf_class_id, - "rank_level": rank_level - }) - if parent_id not in self.taxon_children: - self.taxon_children[parent_id] = [] - self.taxon_children[parent_id].append(taxon_id) - except IOError as e: - print(e) - print(f"\n\nCannot open mapping file `{path}`\n\n") - raise e - - # prints to the console a representation of this tree - def print(self, taxon_id=0, ancestor_prefix=""): - children = self.taxon_children[taxon_id] - index = 0 - for child_id in children: - last_in_branch = (index == len(children) - 1) - index += 1 - icon = "└──" if last_in_branch else "├──" - prefixIcon = " " if last_in_branch else "│ " - taxon = self.taxa[child_id] - print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}') - if child_id in self.taxon_children: - self.print(child_id, f"{ancestor_prefix}{prefixIcon}") - - # calculated nested set left and right values and depth representing how many nodes - # down the taxon is from Life. 
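
# A minimal, self-contained sketch of the nested-set check described in the
# comment above, with made-up left/right values (illustrative only, not part
# of this patch):
#
#     def is_descendant(a, b):
#         # a taxon is a descendant when its interval lies inside the other's
#         return a["left"] > b["left"] and a["right"] < b["right"]
#
#     life = {"left": 0, "right": 11}
#     aves = {"left": 1, "right": 6}
#     is_descendant(aves, life)  # True
#     is_descendant(life, aves)  # False
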
These can be later used for an efficient way to calculate - # if a taxon is a descendant of another - def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]): - for child_id in self.taxon_children[taxon_id]: - self.taxa[child_id].set("left", index) - self.taxa[child_id].set("depth", depth) - self.taxa[child_id].set("ancestors", ancestors) - index += 1 - if child_id in self.taxon_children: - child_ancestors = ancestors + [child_id] - index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors) - self.taxa[child_id].set("right", index) - index += 1 - return index - - -class TFGeoPriorModelEnv: - - def __init__(self, model, taxonomy): - self.taxonomy = taxonomy - # initialize the geo model for inference - self.gpmodel = tf.keras.models.load_model( - model, - custom_objects={'ResLayer': ResLayer}, - compile=False - ) - - - def features_for_one_class_elevation(self, latitude, longitude, elevation): - """Evalutes the model for a single class and multiple locations - - Args: - latitude (list): A list of latitudes - longitude (list): A list of longitudes (same length as latitude) - elevation (list): A list of elevations (same length as latitude) - class_of_interest (int): The single class to eval - - Returns: - numpy array: scores for class of interest at each location - """ - def encode_loc(latitude, longitude, elevation): - latitude = np.array(latitude) - longitude = np.array(longitude) - elevation = np.array(elevation) - elevation = elevation.astype("float32") - grid_lon = longitude.astype('float32') / 180.0 - grid_lat = latitude.astype('float32') / 90.0 - - elevation[elevation>0] = elevation[elevation>0]/6574.0 - elevation[elevation<0] = elevation[elevation<0]/32768.0 - norm_elev = elevation - - if np.isscalar(grid_lon): - grid_lon = np.array([grid_lon]) - if np.isscalar(grid_lat): - grid_lat = np.array([grid_lat]) - if np.isscalar(norm_elev): - norm_elev = np.array([norm_elev]) - - norm_loc = tf.stack([grid_lon, grid_lat], axis=1) - - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi), - tf.expand_dims(norm_elev, axis=1), - - ], axis=1) - - return encoded_loc - - encoded_loc = encode_loc(latitude, longitude, elevation) - loc_emb = self.gpmodel.layers[0](encoded_loc) - - # res layers - feature extraction - x = self.gpmodel.layers[1](loc_emb) - x = self.gpmodel.layers[2](x) - x = self.gpmodel.layers[3](x) - x = self.gpmodel.layers[4](x) - - # process just the one class - return x - - def eval_one_class_elevation_from_features(self, x, class_of_interest): - return tf.keras.activations.sigmoid( - tf.matmul( - x, - tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0), - transpose_b=True - ) - ).numpy() def ignore_shapely_deprecation_warning(message, category, filename, lineno, file=None, line=None): if "array interface is deprecated" in str(message): return None return warnings.defaultaction(message, category, filename, lineno, file, line) + def main(args): print("loading in the model...") - mt = ModelTaxonomy(args.taxonomy) - tfgpm = TFGeoPriorModelEnv(args.model, mt) - + mtd = ModelTaxonomyDataframe(args.taxonomy, None) + tfgpm = TFGeoPriorModelElev(args.model) + print("setting up the map...") warnings.showwarning = ignore_shapely_deprecation_warning im = tifffile.imread(args.elevation) @@ -229,13 +40,13 @@ def main(args): im_df.columns = ["lat", "lng", "elevation"] elev_dfh3 = im_df.h3.geo_to_h3(args.h3_resolution) elev_dfh3 = elev_dfh3.drop( - columns=['lng', 'lat'] - 
).groupby("h3_0"+str(args.h3_resolution)).mean() + columns=["lng", "lat"] + ).groupby("h3_0" + str(args.h3_resolution)).mean() gdfk = elev_dfh3.h3.h3_to_geo() gdfk["lng"] = gdfk["geometry"].x gdfk["lat"] = gdfk["geometry"].y _ = gdfk.pop("geometry") - gdfk = gdfk.rename_axis('h3index') + gdfk = gdfk.rename_axis("h3index") print("making features...") feats = tfgpm.features_for_one_class_elevation( @@ -245,13 +56,20 @@ def main(args): ) print("loading in the training data...") - train_df = pd.read_csv(args.train_spatial_data, - usecols=["taxon_id","latitude","longitude","captive"]).rename({ + train_df = pd.read_csv( + args.train_spatial_data, + usecols=[ + "taxon_id", + "latitude", + "longitude", + "captive" + ] + ).rename({ "latitude": "lat", "longitude": "lng" }, axis=1) - train_df = train_df[train_df.captive==0] #no-CID ok, wild only - train_df.drop(["captive"],axis=1) + train_df = train_df[train_df.captive == 0] # no-CID ok, wild only + train_df.drop(["captive"], axis=1) train_df_h3 = train_df.h3.geo_to_h3(args.h3_resolution) all_spatial_grid_counts = train_df_h3.index.value_counts() presence_absence = pd.DataFrame({ @@ -261,89 +79,103 @@ def main(args): print("...looping through taxa") output = [] - taxa = pd.read_csv(args.taxonomy, usecols=["taxon_id","leaf_class_id","iconic_class_id"]).dropna(subset=['leaf_class_id']) + taxa = pd.read_csv( + args.taxonomy, + usecols=[ + "taxon_id", + "leaf_class_id", + "iconic_class_id" + ] + ).dropna(subset=["leaf_class_id"]) taxon_ids = taxa.taxon_id if args.stop_after is not None: - taxon_ids = taxon_ids[0:args.stop_after] - desired_recall = 0.95 + taxon_ids = taxon_ids[0:args.stop_after] resolution = args.h3_resolution area = h3.hex_area(resolution) for taxon_id in tqdm(taxon_ids): try: - class_of_interest = mt.node_key_to_leaf_class_id[taxon_id] - except: - print('not in the model for some reason') + class_of_interest = mtd.df.loc[taxon_id]["leaf_class_id"] + except Exception: + print("not in the model for some reason") continue - #get predictions + # get predictions preds = tfgpm.eval_one_class_elevation_from_features(feats, class_of_interest) gdfk["pred"] = tf.squeeze(preds).numpy() - - #make presence absence dataset - target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts() + + # make presence absence dataset + target_spatial_grid_counts = \ + train_df_h3[train_df_h3.taxon_id == taxon_id].index.value_counts() presences = gdfk.loc[target_spatial_grid_counts.index]["pred"] if len(presences) == 0: print("not present") continue - - #calculate threhold + + # calculate threhold presence_absence["forground"] = target_spatial_grid_counts presence_absence["predictions"] = gdfk["pred"] presence_absence.forground = presence_absence.forground.fillna(0) - yield_cutoff = np.percentile((presence_absence["background"]/presence_absence["forground"])[presence_absence["forground"]>0], 95) - absences = presence_absence[(presence_absence["forground"]==0) & (presence_absence["background"] > yield_cutoff)]["predictions"] - presences = presence_absence[(presence_absence["forground"]>0)]["predictions"] - df_x = pd.DataFrame({'predictions': presences, 'test': 1}) - df_y = pd.DataFrame({'predictions': absences, 'test': 0}) + yield_cutoff = np.percentile(( + presence_absence["background"] / presence_absence["forground"] + )[presence_absence["forground"] > 0], 95) + absences = presence_absence[ + (presence_absence["forground"] == 0) & (presence_absence["background"] > yield_cutoff) + ]["predictions"] + presences = 
presence_absence[(presence_absence["forground"] > 0)]["predictions"] + df_x = pd.DataFrame({"predictions": presences, "test": 1}) + df_y = pd.DataFrame({"predictions": absences, "test": 0}) for_thres = pd.concat([df_x, df_y], ignore_index=False) - precision, recall, thresholds = precision_recall_curve(for_thres.test, for_thres.predictions) + precision, recall, thresholds = precision_recall_curve( + for_thres.test, + for_thres.predictions + ) p1 = (2 * precision * recall) p2 = (precision + recall) - out = np.zeros( (len(p1)) ) - fscore = np.divide(p1,p2, out=out, where=p2!=0) + out = np.zeros((len(p1))) + fscore = np.divide(p1, p2, out=out, where=p2 != 0) index = np.argmax(fscore) thres = thresholds[index] - - #store daa + + # store daa row = { "taxon_id": taxon_id, "thres": thres, - "area": len(gdfk[gdfk.pred >= thres])*area + "area": len(gdfk[gdfk.pred >= thres]) * area } row_dict = dict(row) output.append(row_dict) - + print("writing output...") output_pd = pd.DataFrame(output) - output_pd.to_csv(args.output_dir+"/thresholds.csv") + output_pd.to_csv(args.output_dir + "/thresholds.csv") + if __name__ == "__main__": - - info_str = '\nrun as follows\n' + \ - ' python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n' + \ - ' --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n' + \ - ' --taxonomy taxonomy_1_4.csv\n' + \ - ' --train_spatial_data v2_6/taxonomy.csv\n' + \ - ' --output_dir v2_6\n' + \ - ' --h3_resolution 4\n' + \ - ' --stop_after 10\n' - + info_str = "\nrun as follows\n" + \ + " python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n" + \ + " --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n" + \ + " --taxonomy taxonomy_1_4.csv\n" + \ + " --train_spatial_data v2_6/taxonomy.csv\n" + \ + " --output_dir v2_6\n" + \ + " --h3_resolution 4\n" + \ + " --stop_after 10\n" + parser = argparse.ArgumentParser(usage=info_str) - parser.add_argument('--elevation', type=str, - help='Path to elev tif.', required=True) - parser.add_argument('--model', type=str, - help='Path to tf model.', required=True) - parser.add_argument('--taxonomy', type=str, - help='Path to taxonomy csv.', required=True) - parser.add_argument('--train_spatial_data', type=str, - help='Path to train csv for occupancy.', required=True) - parser.add_argument('--output_dir', type=str, - help='directory to write thesholds.', required=True) - parser.add_argument('--h3_resolution', type=int, default=4, - help='grid resolution from 0 - 15, lower numbers are coarser/faster. Currently using 4') - parser.add_argument('--stop_after', type=int, - help='just run the first x taxa') + parser.add_argument("--elevation", type=str, + help="Path to elev tif.", required=True) + parser.add_argument("--model", type=str, + help="Path to tf model.", required=True) + parser.add_argument("--taxonomy", type=str, + help="Path to taxonomy csv.", required=True) + parser.add_argument("--train_spatial_data", type=str, + help="Path to train csv for occupancy.", required=True) + parser.add_argument("--output_dir", type=str, + help="directory to write thesholds.", required=True) + parser.add_argument("--h3_resolution", type=int, default=4, + help="grid resolution from 0 - 15, lower numbers are coarser/faster. 
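
# A toy, self-contained illustration of the per-taxon thresholding step above:
# pick the threshold that maximizes F1 along the precision-recall curve
# (labels and scores here are made up; not part of this patch):
import numpy as np
from sklearn.metrics import precision_recall_curve

labels = np.array([1, 1, 1, 0, 1, 0, 0, 0])
scores = np.array([0.9, 0.8, 0.7, 0.6, 0.55, 0.4, 0.3, 0.2])
precision, recall, thresholds = precision_recall_curve(labels, scores)
p1 = 2 * precision * recall
p2 = precision + recall
fscore = np.divide(p1, p2, out=np.zeros(len(p1)), where=p2 != 0)
# the last precision/recall pair has no threshold, so exclude it from argmax
best_threshold = thresholds[np.argmax(fscore[:-1])]  # 0.55 for this toy data
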
" + "Currently using 4") + parser.add_argument("--stop_after", type=int, + help="just run the first x taxa") args = parser.parse_args() main(args) - \ No newline at end of file diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py index 2ae5e92..26e8d57 100644 --- a/lib/inat_inferrer.py +++ b/lib/inat_inferrer.py @@ -52,7 +52,9 @@ def setup_elevation_dataframe(self, config): if "elevation_h3_r4" in config: self.geo_elevation_cells = pd.read_csv(config["elevation_h3_r4"]). \ sort_values("h3_04").set_index("h3_04").sort_index() - self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe(self.geo_elevation_cells) + self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe( + self.geo_elevation_cells + ) def setup_elevation_dataframe_from_worldclim(self, config, resolution): # preventing from processing at too high a resolution @@ -67,7 +69,7 @@ def setup_elevation_dataframe_from_worldclim(self, config, resolution): im_df = im_df.melt(id_vars=["index"]) im_df.columns = ["lat", "lng", "elevation"] elev_dfh3 = im_df.h3.geo_to_h3(resolution) - elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f'h3_0{resolution}').mean() + elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f"h3_0{resolution}").mean() def setup_geo_model(self, config): self.geo_elevation_model = None @@ -112,7 +114,7 @@ def lookup_taxon(self, taxon_id): try: return self.taxonomy.df.loc[taxon_id] except Exception as e: - print(f'taxon `{taxon_id}` does not exist in the taxonomy') + print(f"taxon `{taxon_id}` does not exist in the taxonomy") raise e def predictions_for_image(self, file_path, lat, lng, filter_taxon, score_without_geo=False, @@ -154,7 +156,8 @@ def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, ) # normalize the vision scores so they add up to 1 after filtering sum_of_vision_scores = leaf_scores["vision_score"].sum() - leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] / sum_of_vision_scores + leaf_scores["normalized_vision_score"] = \ + leaf_scores["vision_score"] / sum_of_vision_scores else: # when not filtering by a taxon, the normalized vision score is the same as the original leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] @@ -182,8 +185,8 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, # using nested set left and right values, select the filter taxon, # its descendants, and its ancestors all_node_scores = self.taxonomy.df.query( - f'(left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}) or' + - f'(left < {filter_taxon["left"]} and right > {filter_taxon["right"]})' + f"(left >= {filter_taxon['left']} and right <= {filter_taxon['right']}) or" + f"(left < {filter_taxon['left']} and right > {filter_taxon['right']})" ).copy().reset_index(drop=True) else: all_node_scores = self.taxonomy.df.copy().reset_index(drop=True) @@ -204,7 +207,7 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, aggregated_scores = {} # restrict score aggregation to results where the combined score is above the cutoff - scores_to_aggregate = leaf_scores.query(f'combined_score > {cutoff}') + scores_to_aggregate = leaf_scores.query(f"combined_score > {cutoff}") # loop through all results where the combined score is above the cutoff for taxon_id, vision_score, geo_score, geo_threshold in zip( scores_to_aggregate["taxon_id"], @@ -223,13 +226,14 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, 
aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 100 # aggregated vision score is a sum of descendant scores aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score - if not no_geo_scores and geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: + if not no_geo_scores and \ + geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: # aggregated geo score is the max of descendant geo scores aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score if not no_geo_scores and \ aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] != 0 and \ geo_score > geo_threshold: - # aggregated geo threshold is set to 0 if any descendants are above their threshold + # aggregated threshold is set to 0 if any descendants are above their threshold aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 0 # turn the aggregated_scores dict into a data frame @@ -252,12 +256,13 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, if (no_geo_scores or score_without_geo): # if there are no geo scores, or it was requested to not use geo scores to affect # the final combined score, set the combined scores to be the same as the vision scores - all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] + all_node_scores["aggregated_combined_score"] = \ + all_node_scores["aggregated_vision_score"] else: # the combined score is simply the normalized vision score # multipliedby the normalized geo score - all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] * \ - all_node_scores["aggregated_geo_score"] + all_node_scores["aggregated_combined_score"] = \ + all_node_scores["aggregated_vision_score"] * all_node_scores["aggregated_geo_score"] # calculate a normalized combined score so all values add to 1, to be used for thresholding sum_of_root_node_aggregated_combined_scores = all_node_scores.query( @@ -267,26 +272,30 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, if debug: print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.)) - thresholded_results = all_node_scores.query("normalized_aggregated_combined_score > 0.05") + thresholded_results = all_node_scores.query( + "normalized_aggregated_combined_score > 0.05" + ) print("\nTree of aggregated results:") ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=( - lambda row: f'{row.name} [' + - f'V:{round(row.aggregated_vision_score, 4)}, ' + - f'G:{round(row.aggregated_geo_score, 4)}, ' + - f'C:{round(row.aggregated_combined_score, 4)}, ' + - f'NC:{round(row.normalized_aggregated_combined_score, 4)}]')) + lambda row: f"{row.name} [" + f"V:{round(row.aggregated_vision_score, 4)}, " + f"G:{round(row.aggregated_geo_score, 4)}, " + f"C:{round(row.aggregated_combined_score, 4)}, " + f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" + )) print("") return all_node_scores - def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, raw_results=False): + def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], + thresholded=False, raw_results=False): if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None): return try: taxon = self.taxonomy.df.loc[taxon_id] except Exception as e: - print(f'taxon `{taxon_id}` does not exist in the taxonomy') + print(f"taxon `{taxon_id}` does not exist in the taxonomy") raise e - if math.isnan(taxon["leaf_class_id"]): + if 
pd.isna(taxon["leaf_class_id"]): return geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features( @@ -300,7 +309,7 @@ def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, ra # is smaller. This reduces data needed to be redendered client-side for the Data Layer # mapping approach, and maybe can be removed once switching to map tiles lower_bound_score = np.array([0.0001, taxon["geo_threshold"] / 10]).min() - geo_score_cells = geo_score_cells.query(f'geo_score > {lower_bound_score}') + geo_score_cells = geo_score_cells.query(f"geo_score > {lower_bound_score}") if bounds: min = geo_score_cells["geo_score"].min() @@ -316,7 +325,7 @@ def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, ra return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"])) def h3_04_taxon_range(self, taxon_id, bounds=[]): - taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f'{taxon_id}.csv') + taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f"{taxon_id}.csv") if not os.path.exists(taxon_range_path): return None taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). \ @@ -328,7 +337,9 @@ def h3_04_taxon_range(self, taxon_id, bounds=[]): return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"])) def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): - geomodel_results = self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {} + geomodel_results = self.h3_04_geo_results_for_taxon( + taxon_id, bounds, thresholded=True + ) or {} taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {} combined_results = {} for cell_key in geomodel_results: @@ -394,7 +405,7 @@ def filter_geo_dataframe_by_bounds(geo_df, bounds): # query for cells wtihin the buffered bounds, and potentially # on the other side of the antimeridian - query = f'lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and ' + \ - f' ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})' + \ - f' {antimedirian_condition})' + query = f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " + \ + f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" + \ + f" {antimedirian_condition})" return geo_df.query(query) diff --git a/lib/inat_vision_api.py b/lib/inat_vision_api.py index 8703c43..42fd8b0 100644 --- a/lib/inat_vision_api.py +++ b/lib/inat_vision_api.py @@ -57,7 +57,7 @@ def h3_04_default_route(self, h3_04_method): else: results_dict = h3_04_method(taxon_id, bounds) if results_dict is None: - return f'Unknown taxon_id {taxon_id}', 422 + return f"Unknown taxon_id {taxon_id}", 422 return InatVisionAPI.round_floats(results_dict, 8) def h3_04_bounds_route(self): @@ -67,7 +67,7 @@ def h3_04_bounds_route(self): results_dict = self.inferrer.h3_04_bounds(taxon_id) if results_dict is None: - return f'Unknown taxon_id {taxon_id}', 422 + return f"Unknown taxon_id {taxon_id}", 422 return results_dict def index_route(self): @@ -133,11 +133,11 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): "aggregated_geo_threshold": "geo_threshold" } - no_geo_scores = (leaf_scores["geo_score"].max() == 0) - # set a cutoff where branches whose combined scores are below the threshold are ignored # TODO: this threshold is completely arbitrary and needs testing - aggregated_results = aggregated_results.query("normalized_aggregated_combined_score > 0.05") + aggregated_results = aggregated_results.query( + 
"normalized_aggregated_combined_score > 0.05" + ) # after setting a cutoff, get the parent IDs of the remaining taxa parent_taxon_ids = aggregated_results["parent_taxon_id"].values # noqa: F841 @@ -145,20 +145,30 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): # taxa who are not parents of any remaining taxa leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids") - leaf_results = leaf_results.sort_values("aggregated_combined_score", ascending=False).head(100) + leaf_results = leaf_results.sort_values( + "aggregated_combined_score", + ascending=False + ).head(100) score_columns = ["aggregated_combined_score", "aggregated_geo_score", "aggregated_vision_score", "aggregated_geo_threshold"] leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index") final_results = leaf_results[columns_to_return].rename(columns=column_mapping) else: - no_geo_scores = (leaf_scores["geo_score"].max() == 0) - top_combined_score = leaf_scores.sort_values("combined_score", ascending=False).head(1)["combined_score"].values[0] + top_combined_score = leaf_scores.sort_values( + "combined_score", + ascending=False + ).head(1)["combined_score"].values[0] # set a cutoff so results whose combined scores are # much lower than the best score are not returned - leaf_scores = leaf_scores.query(f'combined_score > {top_combined_score * 0.001}') + leaf_scores = leaf_scores.query(f"combined_score > {top_combined_score * 0.001}") top100 = leaf_scores.sort_values("combined_score", ascending=False).head(100) - score_columns = ["combined_score", "geo_score", "normalized_vision_score", "geo_threshold"] + score_columns = [ + "combined_score", + "geo_score", + "normalized_vision_score", + "geo_threshold" + ] top100[score_columns] = top100[score_columns].multiply(100, axis="index") # legacy dict response @@ -222,7 +232,7 @@ def valid_leaf_taxon_id_for_request(self, request): taxon_id = int(taxon_id) if float(taxon_id) not in self.inferrer.taxonomy.leaf_df["taxon_id"].values: - return None, f'Unknown taxon_id {taxon_id}', 422 + return None, f"Unknown taxon_id {taxon_id}", 422 return taxon_id, None, None def valid_bounds_for_request(self, request): @@ -242,12 +252,12 @@ def valid_bounds_for_request(self, request): def write_logstash(image_uuid, file_path, request_start_datetime, request_start_time): request_end_time = time.time() request_time = round((request_end_time - request_start_time) * 1000, 6) - logstash_log = open('log/logstash.log', 'a') - log_data = {'@timestamp': request_start_datetime.isoformat(), - 'uuid': image_uuid, - 'duration': request_time, - 'client_ip': request.access_route[0], - 'image_size': os.path.getsize(file_path)} + logstash_log = open("log/logstash.log", "a") + log_data = {"@timestamp": request_start_datetime.isoformat(), + "uuid": image_uuid, + "duration": request_time, + "client_ip": request.access_route[0], + "image_size": os.path.getsize(file_path)} json.dump(log_data, logstash_log) logstash_log.write("\n") logstash_log.close() diff --git a/lib/model_taxonomy.py b/lib/model_taxonomy.py deleted file mode 100644 index d390516..0000000 --- a/lib/model_taxonomy.py +++ /dev/null @@ -1,73 +0,0 @@ -import csv -from lib.taxon import Taxon - - -class ModelTaxonomy: - - def __init__(self, path): - self.load_mapping(path) - self.assign_nested_values() - - def load_mapping(self, path): - self.node_key_to_leaf_class_id = {} - self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy have a parent ID of 0, - # so 
create a fake taxon of Life to represent the root of the entire tree - self.taxa = {0: Taxon({"name": "Life", "depth": 0})} - self.taxon_children = {} - try: - with open(path) as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter=",") - for row in csv_reader: - taxon_id = int(row["taxon_id"]) - rank_level = float(row["rank_level"]) - leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None - parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0 - # some taxa are not leaves and aren't represented in the leaf layer - if leaf_class_id is not None: - self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id - self.leaf_class_to_taxon[leaf_class_id] = taxon_id - self.taxa[taxon_id] = Taxon({ - "id": taxon_id, - "name": row["name"], - "parent_id": parent_id, - "leaf_class_id": leaf_class_id, - "rank_level": rank_level - }) - if parent_id not in self.taxon_children: - self.taxon_children[parent_id] = [] - self.taxon_children[parent_id].append(taxon_id) - except IOError as e: - print(e) - print(f"\n\nCannot open mapping file `{path}`\n\n") - raise e - - # prints to the console a representation of this tree - def print(self, taxon_id=0, ancestor_prefix=""): - children = self.taxon_children[taxon_id] - index = 0 - for child_id in children: - last_in_branch = (index == len(children) - 1) - index += 1 - icon = "└──" if last_in_branch else "├──" - prefixIcon = " " if last_in_branch else "│ " - taxon = self.taxa[child_id] - print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}') - if child_id in self.taxon_children: - self.print(child_id, f"{ancestor_prefix}{prefixIcon}") - - # calculated nested set left and right values and depth representing how many nodes - # down the taxon is from Life. These can be later used for an efficient way to calculate - # if a taxon is a descendant of another - def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]): - for child_id in self.taxon_children[taxon_id]: - self.taxa[child_id].set("left", index) - self.taxa[child_id].set("depth", depth) - self.taxa[child_id].set("ancestors", ancestors) - index += 1 - if child_id in self.taxon_children: - child_ancestors = ancestors + [child_id] - index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors) - self.taxa[child_id].set("right", index) - index += 1 - return index diff --git a/lib/model_taxonomy_dataframe.py b/lib/model_taxonomy_dataframe.py index a707c7d..4264d1f 100644 --- a/lib/model_taxonomy_dataframe.py +++ b/lib/model_taxonomy_dataframe.py @@ -1,4 +1,3 @@ -import math import pandas as pd @@ -8,7 +7,27 @@ def __init__(self, path, thresholds_path): self.load_mapping(path, thresholds_path) def load_mapping(self, path, thresholds_path): - self.df = pd.read_csv(path) + self.df = pd.read_csv( + path, + usecols=[ + "parent_taxon_id", + "taxon_id", + "rank_level", + "leaf_class_id", + "iconic_class_id", + "spatial_class_id", + "name" + ], + dtype={ + "parent_taxon_id": "Int64", + "taxon_id": int, + "rank_level": float, + "leaf_class_id": "Int64", + "iconic_class_id": "Int64", + "spatial_class_id": "Int64", + "name": pd.StringDtype() + } + ) # left and right will be used to store nested set indices self.df["left"] = pd.Series([], dtype=object) self.df["right"] = pd.Series([], dtype=object) @@ -17,7 +36,7 @@ def load_mapping(self, path, thresholds_path): self.taxon_ancestors = {} for index, taxon in self.df.iterrows(): self.taxon_row_mapping[taxon["taxon_id"]] = index - parent_id = 0 if math.isnan(taxon["parent_taxon_id"]) 
else int(taxon["parent_taxon_id"]) + parent_id = 0 if pd.isna(taxon["parent_taxon_id"]) else int(taxon["parent_taxon_id"]) if parent_id not in self.taxon_children: self.taxon_children[parent_id] = [] self.taxon_children[parent_id].append(taxon["taxon_id"]) @@ -50,7 +69,7 @@ def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]): def children(df, taxon_id): if taxon_id == 0: return df.query("parent_taxon_id.isnull()") - return df.query(f'parent_taxon_id == {taxon_id}') + return df.query(f"parent_taxon_id == {taxon_id}") @staticmethod def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): @@ -65,10 +84,15 @@ def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): index += 1 icon = "└──" if last_in_branch else "├──" prefixIcon = " " if last_in_branch else "│ " - print(f'{ancestor_prefix}{icon}', end="") + print(f"{ancestor_prefix}{icon}", end="") if display_taxon_lambda is None: - print(f'{row.name} :: {row.left}:{row.right}') + print(f"{row.name} :: {row.left}:{row.right}") else: print(display_taxon_lambda(row)) if row.right != row.left + 1: - ModelTaxonomyDataframe.print(df, row.taxon_id, f"{ancestor_prefix}{prefixIcon}", display_taxon_lambda) + ModelTaxonomyDataframe.print( + df, + row.taxon_id, + f"{ancestor_prefix}{prefixIcon}", + display_taxon_lambda + ) diff --git a/lib/model_test_data_export_manager.py b/lib/model_test_data_export_manager.py index 4c3ee2e..3aa35f9 100644 --- a/lib/model_test_data_export_manager.py +++ b/lib/model_test_data_export_manager.py @@ -21,9 +21,9 @@ def load_train_data_photo_ids(self): def export_path(self, filename_addition): currentDatetime = datetime.now() timestamp = currentDatetime.strftime("%Y%m%d") - export_path = f'test-obs-{timestamp}' + export_path = f"test-obs-{timestamp}" if filename_addition: - export_path += f'-{filename_addition}' + export_path += f"-{filename_addition}" if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]: export_path += "-" + self.cmd_args["filename_suffix"] export_path += ".csv" @@ -38,7 +38,7 @@ async def generate_from_cmd_args(self): parameters_string = None if api_parameters: - parameters_string = "-".join(map(lambda key: f'{key}-{api_parameters[key]}', + parameters_string = "-".join(map(lambda key: f"{key}-{api_parameters[key]}", api_parameters)) export_path = self.export_path(parameters_string) exporter = ModelTestDataExporter( diff --git a/lib/model_test_data_exporter.py b/lib/model_test_data_exporter.py index 5013bd7..f0b32c0 100644 --- a/lib/model_test_data_exporter.py +++ b/lib/model_test_data_exporter.py @@ -95,7 +95,7 @@ async def fetch_more_data(self): min_pages_remaining = math.ceil( (self.max_results / ModelTestDataExporter.API_REQUEST_PER_PAGE) ) - print(f'Queueing {min_pages_remaining} workers') + print(f"Queueing {min_pages_remaining} workers") for i in range(min_pages_remaining): await self.queue.put(i) await self.queue.join() @@ -110,7 +110,7 @@ async def process_api_response(self): if self.finished(): return - print(f'Fetching more results... {self.rows_written} so far') + print(f"Fetching more results... 
{self.rows_written} so far") starting_rows_written = self.rows_written async with self.session.get(ModelTestDataExporter.API_BASE_URL, params=self.api_parameters) as response: @@ -158,7 +158,8 @@ def process_api_response_row(self, row): self.used_observations[row["uuid"]] = True return - if row["quality_grade"] == "casual" and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]): + if row["quality_grade"] == "casual" \ + and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]): self.used_observations[row["uuid"]] = True return diff --git a/lib/pt_geo_prior_model.py b/lib/pt_geo_prior_model.py index b81b4e9..77ea3b8 100644 --- a/lib/pt_geo_prior_model.py +++ b/lib/pt_geo_prior_model.py @@ -31,7 +31,7 @@ def predict(self, latitude, longitude, filter_taxon_id=None): try: filter_taxon = self.taxonomy.df.iloc[filter_taxon_id] except Exception as e: - print(f'filter_taxon `{filter_taxon_id}` does not exist in the taxonomy') + print(f"filter_taxon `{filter_taxon_id}` does not exist in the taxonomy") raise e location = np.array([longitude, latitude])[np.newaxis, ...] # we're not currently using date inference, so set default values for date diff --git a/lib/taxon.py b/lib/taxon.py deleted file mode 100644 index 63e169a..0000000 --- a/lib/taxon.py +++ /dev/null @@ -1,30 +0,0 @@ -# Taxon: -# parent_taxon_id -# taxon_id -# rank_level -# leaf_class_id -# iconic_class_id -# name -# left -# right -# depth - - -class Taxon: - - def __init__(self, row): - for key in row: - self.set(key, row[key]) - - def set(self, attr, val): - setattr(self, attr, val) - - def is_or_descendant_of(self, taxon): - if self.id == taxon.id: - return True - return self.descendant_of(taxon) - - # using the nested set left and right values, a taxon is a descendant of another - # as long as its left is higher and its right is lower - def descendant_of(self, taxon): - return self.left > taxon.left and self.right < taxon.right diff --git a/lib/tf_gp_elev_model.py b/lib/tf_gp_elev_model.py index ebaad95..caf3fad 100644 --- a/lib/tf_gp_elev_model.py +++ b/lib/tf_gp_elev_model.py @@ -11,13 +11,13 @@ class TFGeoPriorModelElev: def __init__(self, model_path): # initialize the geo model for inference - tf.config.set_visible_devices([], 'GPU') + tf.config.set_visible_devices([], "GPU") visible_devices = tf.config.get_visible_devices() for device in visible_devices: - assert device.device_type != 'GPU' + assert device.device_type != "GPU" self.gpmodel = tf.keras.models.load_model( model_path, - custom_objects={'ResLayer': ResLayer}, + custom_objects={"ResLayer": ResLayer}, compile=False ) diff --git a/lib/vision_inferrer.py b/lib/vision_inferrer.py index f1c7482..a81e8ee 100644 --- a/lib/vision_inferrer.py +++ b/lib/vision_inferrer.py @@ -10,10 +10,10 @@ def __init__(self, model_path): # initialize the TF model given the configured path def prepare_tf_model(self): # disable GPU processing - tf.config.set_visible_devices([], 'GPU') + tf.config.set_visible_devices([], "GPU") visible_devices = tf.config.get_visible_devices() for device in visible_devices: - assert device.device_type != 'GPU' + assert device.device_type != "GPU" self.vision_model = tf.keras.models.load_model(self.model_path, compile=False) diff --git a/lib/vision_testing.py b/lib/vision_testing.py index 73f21ce..2da6329 100644 --- a/lib/vision_testing.py +++ b/lib/vision_testing.py @@ -1,6 +1,4 @@ -import csv import os -import urllib import hashlib import magic import time @@ -34,7 +32,7 @@ def __init__(self, config, 
**args): print("Models:") for index, model_config in enumerate(config["models"]): print(json.dumps(model_config, indent=4)) - model_name = model_config["name"] if "name" in model_config else f'Model {index}' + model_name = model_config["name"] if "name" in model_config else f"Model {index}" model_config["name"] = model_name for score_type in score_types: self.scores[score_type]["vision"][index] = [] @@ -134,10 +132,16 @@ def print_scores(self): (sum(top5_distance_scores) / metrics["count"]) * 100, 2) metrics["top10∆"] = round( (sum(top10_distance_scores) / metrics["count"]) * 100, 2) - metrics["avg∆"] = round( - (mean(self.scores["average_ancestor_distance_scores"][method][index]) / metrics["count"]) * 100, 2) - metrics["sum∆"] = round( - (mean(self.scores["sum_ancestor_distance_scores"][method][index]) / metrics["count"]) * 100, 2) + metrics["avg∆"] = round(( + mean( + self.scores["average_ancestor_distance_scores"][method][index] + ) / metrics["count"] + ) * 100, 2) + metrics["sum∆"] = round(( + mean( + self.scores["sum_ancestor_distance_scores"][method][index] + ) / metrics["count"] + ) * 100, 2) all_metrics[method] = metrics print("method " + "\t" + "\t".join(all_metrics["vision"].keys())) @@ -147,16 +151,6 @@ def print_scores(self): str(value) for value in all_metrics[method].values())) print("\n") - # NOTE: this is assuming no conversion is needed. - # Ideally we'd reuse the inat_inferrer prepare_image_for_inference - def prepare_image_for_inference(self, cache_path): - image = tf.io.read_file(cache_path) - image = tf.image.decode_jpeg(image, channels=3) - image = tf.image.convert_image_dtype(image, tf.float32) - image = tf.image.central_crop(image, 0.875) - image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) - return tf.expand_dims(image, 0) - def assess_top_results(self, observation, top_results): match_index = None distance_scores = [] @@ -194,7 +188,7 @@ async def test_observation_async(self, observation): cache_path = await self.download_photo_async(observation.photo_url) if cache_path is None or not os.path.exists(cache_path): return False - if observation.lat == '' or observation.lng == '': + if observation.lat == "" or observation.lng == "": return False iconic_taxon_id = None @@ -215,7 +209,7 @@ async def test_observation_async(self, observation): ) except Exception as e: print(e) - print(f'\nError scoring observation {observation.observation_id}') + print(f"\nError scoring observation {observation.observation_id}") return False return inferrer_scores @@ -237,7 +231,9 @@ def ancestor_distance_scores(self, observation, inferrer, results): if result_ancestor_match_index is None: result_ancestor_match_index = len(reversed_target_ancestors) # calculate a score of how far from species the result matched the target - ancestor_distance_scores.append((1 - (result_ancestor_match_index / len(reversed_target_ancestors)))**2) + ancestor_distance_scores.append((1 - ( + result_ancestor_match_index / len(reversed_target_ancestors) + ))**2) return ancestor_distance_scores def append_to_aggregate_results(self, observation, inferrer_scores): @@ -318,5 +314,8 @@ def debug(self, message): def report_progress(self): if self.processed_counter % 10 == 0: total_time = round(time.time() - self.start_time, 3) - remaining_time = round((self.limit - self.processed_counter) / (self.processed_counter / total_time), 3) - print(f'Processed {self.processed_counter} in {total_time} sec\testimated {remaining_time} sec remaining') + remaining_time = round(( + self.limit - 
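
# A worked example of the time-remaining estimate computed here (numbers made
# up, not from a real run): after processing 50 of 200 observations in 100
# seconds, the rate is 50 / 100 = 0.5 obs/sec, so the estimate is
# (200 - 50) / 0.5 = 300 seconds remaining.
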
self.processed_counter + ) / (self.processed_counter / total_time), 3) + print(f"Processed {self.processed_counter} in {total_time} sec\t" + f"estimated {remaining_time} sec remaining") diff --git a/requirements.txt b/requirements.txt index a4d3300..8fc83f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ coverage flake8 +flake8-quotes Flask Flask-WTF h3 @@ -11,6 +12,7 @@ matplotlib numpy pandas Pillow +pipenv prison pytest pytest-cov diff --git a/taxon_range_evaluation.py b/taxon_range_evaluation.py index 25e87ec..1d00336 100644 --- a/taxon_range_evaluation.py +++ b/taxon_range_evaluation.py @@ -2,8 +2,8 @@ Script to evaluate model and thresholds against taxon ranges """ + import argparse -import csv import tensorflow as tf import pandas as pd import gc @@ -12,284 +12,44 @@ import tifffile import numpy as np import h3 -import h3pandas -import math +import h3pandas # noqa: F401 import geopandas as gpd +import matplotlib.pyplot as plt from sklearn.metrics import auc from sklearn.metrics import precision_recall_curve +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe +from lib.tf_gp_elev_model import TFGeoPriorModelElev -class ResLayer(tf.keras.layers.Layer): - def __init__(self): - super(ResLayer, self).__init__() - self.w1 = tf.keras.layers.Dense( - 256, activation="relu", kernel_initializer="he_normal" - ) - self.w2 = tf.keras.layers.Dense( - 256, activation="relu", kernel_initializer="he_normal" - ) - self.dropout = tf.keras.layers.Dropout(rate=0.5) - self.add = tf.keras.layers.Add() - - def call(self, inputs): - x = self.w1(inputs) - x = self.dropout(x) - x = self.w2(x) - x = self.add([x, inputs]) - return x - - def get_config(self): - return {} - -class Taxon: - - def __init__(self, row): - for key in row: - setattr(self, key, row[key]) - - def set(self, attr, val): - setattr(self, attr, val) - - def is_or_descendant_of(self, taxon): - if self.id == taxon.id: - return True - return self.descendant_of(taxon) - - # using the nested set left and right values, a taxon is a descendant of another - # as long as its left is higher and its right is lower - def descendant_of(self, taxon): - return self.left > taxon.left and self.right < taxon.right - -class ModelTaxonomy: - - def __init__(self, path): - self.load_mapping(path) - self.assign_nested_values() - - def load_mapping(self, path): - self.node_key_to_leaf_class_id = {} - self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0, - # so create a fake taxon of Life to represent the root of the entire tree - self.taxa = {0: Taxon({"name": "Life", "depth": 0})} - self.taxon_children = {} - try: - with open(path) as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter=",") - for row in csv_reader: - taxon_id = int(row["taxon_id"]) - rank_level = float(row["rank_level"]) - leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None - parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0 - # some taxa are not leaves and aren't represented in the leaf layer - if leaf_class_id is not None: - self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id - self.leaf_class_to_taxon[leaf_class_id] = taxon_id - self.taxa[taxon_id] = Taxon({ - "id": taxon_id, - "name": row["name"], - "parent_id": parent_id, - "leaf_class_id": leaf_class_id, - "rank_level": rank_level - }) - if parent_id not in self.taxon_children: - self.taxon_children[parent_id] = [] - self.taxon_children[parent_id].append(taxon_id) - except IOError as e: - 
print(e) - print(f"\n\nCannot open mapping file `{path}`\n\n") - raise e - - # prints to the console a representation of this tree - def print(self, taxon_id=0, ancestor_prefix=""): - children = self.taxon_children[taxon_id] - index = 0 - for child_id in children: - last_in_branch = (index == len(children) - 1) - index += 1 - icon = "└──" if last_in_branch else "├──" - prefixIcon = " " if last_in_branch else "│ " - taxon = self.taxa[child_id] - print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}') - if child_id in self.taxon_children: - self.print(child_id, f"{ancestor_prefix}{prefixIcon}") - - # calculated nested set left and right values and depth representing how many nodes - # down the taxon is from Life. These can be later used for an efficient way to calculate - # if a taxon is a descendant of another - def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]): - for child_id in self.taxon_children[taxon_id]: - self.taxa[child_id].set("left", index) - self.taxa[child_id].set("depth", depth) - self.taxa[child_id].set("ancestors", ancestors) - index += 1 - if child_id in self.taxon_children: - child_ancestors = ancestors + [child_id] - index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors) - self.taxa[child_id].set("right", index) - index += 1 - return index - -class TFGeoPriorModelEnv: - - def __init__(self, model_path, taxonomy): - self.taxonomy = taxonomy - # initialize the geo model for inference - self.gpmodel = tf.keras.models.load_model( - model_path, - custom_objects={'ResLayer': ResLayer}, - compile=False - ) - - def eval_one_class_elevation(self, latitude, longitude, elevation, class_of_interest): - """Evalutes the model for a single class and multiple locations - - Args: - latitude (list): A list of latitudes - longitude (list): A list of longitudes (same length as latitude) - elevation (list): A list of elevations (same length as latitude) - class_of_interest (int): The single class to eval - - Returns: - numpy array: scores for class of interest at each location - """ - def encode_loc(latitude, longitude, elevation): - latitude = np.array(latitude) - longitude = np.array(longitude) - elevation = np.array(elevation) - elevation = elevation.astype("float32") - grid_lon = longitude.astype('float32') / 180.0 - grid_lat = latitude.astype('float32') / 90.0 - - elevation[elevation>0] = elevation[elevation>0]/6574.0 - elevation[elevation<0] = elevation[elevation<0]/32768.0 - norm_elev = elevation - - if np.isscalar(grid_lon): - grid_lon = np.array([grid_lon]) - if np.isscalar(grid_lat): - grid_lat = np.array([grid_lat]) - if np.isscalar(norm_elev): - norm_elev = np.array([norm_elev]) - - norm_loc = tf.stack([grid_lon, grid_lat], axis=1) - - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi), - tf.expand_dims(norm_elev, axis=1), - - ], axis=1) - - return encoded_loc - - encoded_loc = encode_loc(latitude, longitude, elevation) - loc_emb = self.gpmodel.layers[0](encoded_loc) - - # res layers - feature extraction - x = self.gpmodel.layers[1](loc_emb) - x = self.gpmodel.layers[2](x) - x = self.gpmodel.layers[3](x) - x = self.gpmodel.layers[4](x) - - # process just the one class - return tf.keras.activations.sigmoid( - tf.matmul( - x, - tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0), - transpose_b=True - ) - ).numpy() - - def features_for_one_class_elevation(self, latitude, longitude, elevation): - """Evalutes the model for a single class and multiple 
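
# A minimal, self-contained sketch of the location encoding these geo models
# use, with the same normalization constants as the (since-removed) helper
# below; illustrative only, not part of this patch:
import numpy as np

def encode_loc(lat, lng, elevation_m):
    lat_n = lat / 90.0
    lng_n = lng / 180.0
    elev_n = elevation_m / 6574.0 if elevation_m > 0 else elevation_m / 32768.0
    # sin/cos wrapping keeps longitudes continuous across the antimeridian
    return np.array([np.sin(lng_n * np.pi), np.sin(lat_n * np.pi),
                     np.cos(lng_n * np.pi), np.cos(lat_n * np.pi), elev_n])

encode_loc(42.36, -71.06, 100.0)  # one 5-feature input row
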
locations - - Args: - latitude (list): A list of latitudes - longitude (list): A list of longitudes (same length as latitude) - elevation (list): A list of elevations (same length as latitude) - class_of_interest (int): The single class to eval - - Returns: - numpy array: scores for class of interest at each location - """ - def encode_loc(latitude, longitude, elevation): - latitude = np.array(latitude) - longitude = np.array(longitude) - elevation = np.array(elevation) - elevation = elevation.astype("float32") - grid_lon = longitude.astype('float32') / 180.0 - grid_lat = latitude.astype('float32') / 90.0 - - elevation[elevation>0] = elevation[elevation>0]/6574.0 - elevation[elevation<0] = elevation[elevation<0]/32768.0 - norm_elev = elevation - - if np.isscalar(grid_lon): - grid_lon = np.array([grid_lon]) - if np.isscalar(grid_lat): - grid_lat = np.array([grid_lat]) - if np.isscalar(norm_elev): - norm_elev = np.array([norm_elev]) - - norm_loc = tf.stack([grid_lon, grid_lat], axis=1) - - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi), - tf.expand_dims(norm_elev, axis=1), - - ], axis=1) - - return encoded_loc - - encoded_loc = encode_loc(latitude, longitude, elevation) - loc_emb = self.gpmodel.layers[0](encoded_loc) - - # res layers - feature extraction - x = self.gpmodel.layers[1](loc_emb) - x = self.gpmodel.layers[2](x) - x = self.gpmodel.layers[3](x) - x = self.gpmodel.layers[4](x) - - # process just the one class - return x - - def eval_one_class_elevation_from_features(self, x, class_of_interest): - return tf.keras.activations.sigmoid( - tf.matmul( - x, - tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0), - transpose_b=True - ) - ).numpy() def evaluate_p_r(thres, gdfb, tr_h3, world, plot): - bp_h3 = gdfb[gdfb["pred"]>=thres].copy() + bp_h3 = gdfb[gdfb["pred"] >= thres].copy() area = bp_h3.shape[0] if area == 0: return None, None, None - tt = tr_h3.h3.h3_to_geo_boundary()[['geometry']].copy() - fp_map = bp_h3[~bp_h3.index.isin(tt.index)].h3.h3_to_geo_boundary()[['geometry']].copy() + tt = tr_h3.h3.h3_to_geo_boundary()[["geometry"]].copy() + fp_map = bp_h3[~bp_h3.index.isin(tt.index)].h3.h3_to_geo_boundary()[["geometry"]].copy() fp_map = fp_map.set_geometry(fp_map.geometry.apply(push_right)) fp_map["score"] = 1 - tp_map = tt[tt.index.isin(bp_h3.index)][['geometry']].copy() + tp_map = tt[tt.index.isin(bp_h3.index)][["geometry"]].copy() tp_map["score"] = 2 - fn_map = tt[~tt.index.isin(bp_h3.index)][['geometry']].copy() + fn_map = tt[~tt.index.isin(bp_h3.index)][["geometry"]].copy() fn_map["score"] = 3 kappa_map = pd.concat([fp_map, tp_map, fn_map], axis=0) - - fp=kappa_map[kappa_map["score"]==1].shape[0] #fp - tp=kappa_map[kappa_map["score"]==2].shape[0] #tp - fn=kappa_map[kappa_map["score"]==3].shape[0] #fn - p = tp/(tp+fp) - r = tp/(fn+tp) - - if plot==True: + + fp = kappa_map[kappa_map["score"] == 1].shape[0] # fp + tp = kappa_map[kappa_map["score"] == 2].shape[0] # tp + fn = kappa_map[kappa_map["score"] == 3].shape[0] # fn + if tp + fp == 0 or fn + tp == 0: + return None, None, None + p = tp / (tp + fp) + r = tp / (fn + tp) + + if plot is True: print("Precision: " + str(p)) print("Recall: " + str(r)) kappa_map_geometry_total_bounds = kappa_map.geometry.total_bounds if np.isnan(kappa_map_geometry_total_bounds).any(): - minx, miny, maxx, maxy = [-180, -90, 180, 90] + minx, miny, maxx, maxy = [-180, -90, 180, 90] else: minx, miny, maxx, maxy = kappa_map_geometry_total_bounds fig, ax = plt.subplots(figsize=(10, 10)) @@ 
-302,9 +62,10 @@ def evaluate_p_r(thres, gdfb, tr_h3, world, plot): ax.set_xlim(minx - .1, maxx + .1) ax.set_ylim(miny - .1, maxy + .1) plt.show() - + return p, r, area + def push_right(geom): def shift_pts(pts): for x, y in pts: @@ -319,6 +80,7 @@ def shift_pts(pts): holes = list() return type(geom)(shell, holes) + def get_prauc(gdfb, tr_h3, plot): bp_h3 = gdfb.copy() if bp_h3.shape[0] == 0: @@ -330,35 +92,39 @@ def get_prauc(gdfb, tr_h3, plot): precision, recall, thresholds = precision_recall_curve(test, predictions) p1 = (2 * precision * recall) p2 = (precision + recall) - out = np.zeros( (len(p1)) ) - fscore = np.divide(p1,p2, out=out, where=p2!=0) + out = np.zeros((len(p1))) + fscore = np.divide(p1, p2, out=out, where=p2 != 0) index = np.argmax(fscore) prthres = thresholds[index] prf1 = fscore[index] prprecision = precision[index] prrecall = recall[index] prauc = auc(recall, precision) - if plot==True: + if plot is True: print("PR AUC: " + str(prauc)) fig, ax = plt.subplots() - ax.plot(recall, precision, color='purple') - ax.plot([recall[index]], [precision[index]], color='green', marker='o') - ax.set_title('Precision-Recall Curve') - ax.set_ylabel('Precision') - ax.set_xlabel('Recall') + ax.plot(recall, precision, color="purple") + ax.plot([recall[index]], [precision[index]], color="green", marker="o") + ax.set_title("Precision-Recall Curve") + ax.set_ylabel("Precision") + ax.set_xlabel("Recall") plt.show() return prauc, prthres, prf1, prprecision, prrecall + def main(args): print("read in the taxonomy...") - taxa = pd.read_csv(args.taxonomy, usecols=["taxon_id","leaf_class_id","iconic_class_id"]).dropna(subset=['leaf_class_id']) + taxa = pd.read_csv( + args.taxonomy, + usecols=["taxon_id", "leaf_class_id", "iconic_class_id"] + ).dropna(subset=["leaf_class_id"]) taxon_ids = taxa.taxon_id if args.stop_after is not None: - taxon_ids = taxon_ids[0:args.stop_after] - mt = ModelTaxonomy(args.taxonomy) + taxon_ids = taxon_ids[0:args.stop_after] + mtd = ModelTaxonomyDataframe(args.taxonomy, None) print("read in the model...") - tfgpm = TFGeoPriorModelEnv(args.model, mt) + tfgpm = TFGeoPriorModelElev(args.model) print("read in the taxon range recalls and thresholds...") taxon_range_recalls = pd.read_csv(args.taxon_range_recalls) @@ -366,10 +132,10 @@ def main(args): print("reading in the elevation and world map...") im = tifffile.imread(args.elevation) - world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) + world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres")) print("processing elevation and making features...") - h3_resolution=4 + h3_resolution = 4 im_df = pd.DataFrame(im) im_df.index = np.linspace(90, -90, 2160) im_df.columns = np.linspace(-180, 180, 4320) @@ -380,110 +146,113 @@ def main(args): im_df.columns = ["lat", "lng", "elevation"] elev_dfh3 = im_df.h3.geo_to_h3(h3_resolution) elev_dfh3 = elev_dfh3.drop( - columns=['lng', 'lat'] - ).groupby("h3_0"+str(h3_resolution)).mean() + columns=["lng", "lat"] + ).groupby("h3_0" + str(h3_resolution)).mean() gdfk = elev_dfh3.h3.h3_to_geo() gdfk["lng"] = gdfk["geometry"].x gdfk["lat"] = gdfk["geometry"].y _ = gdfk.pop("geometry") - gdfk = gdfk.rename_axis('h3index') + gdfk = gdfk.rename_axis("h3index") feats = tfgpm.features_for_one_class_elevation( latitude=list(gdfk.lat), longitude=list(gdfk.lng), elevation=list(gdfk.elevation) ) - + print("looping through the taxa...") eval_output = [] for taxon_id in tqdm(taxon_ids): - #check whether taxon represented in taxon range eval set + # check whether taxon represented in 
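
# A worked example of the precision/recall bookkeeping in evaluate_p_r above
# (cell counts made up): with tp=80 cells both predicted and in range, fp=20
# predicted but outside the range, and fn=40 in range but missed,
# precision = 80 / (80 + 20) = 0.80, recall = 80 / (40 + 80) ~= 0.67, and
# f1 = 2 * 0.80 * 0.67 / (0.80 + 0.67) ~= 0.73.
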
taxon range eval set if taxon_range_recalls[taxon_range_recalls.taxon_id.eq(taxon_id)].shape[0] == 0: continue - if taxon_range_recalls[(taxon_range_recalls['taxon_id'] == taxon_id) & (taxon_range_recalls['r'] > 0.9)].empty: + if taxon_range_recalls[ + (taxon_range_recalls["taxon_id"] == taxon_id) & (taxon_range_recalls["recall"] > 0.9) + ].empty: continue - taxon_range_indicies = args.taxon_range_indicies+"/"+ str(taxon_id) +".csv" - if exists(taxon_range_indicies) == False: + taxon_range_indicies = args.taxon_range_indicies + "/" + str(taxon_id) + ".csv" + if not exists(taxon_range_indicies): continue - - #process taxon range + + # process taxon range try: taxon_range_index = pd.read_csv(taxon_range_indicies, header=None) - taxon_range_index.rename(columns={0: 'h3index_new'}, inplace=True) + taxon_range_index.rename(columns={0: "h3index_new"}, inplace=True) tr_h3 = gdfk.loc[gdfk.index.isin(taxon_range_index.h3index_new)] - except: + except Exception: gc.collect() continue - #get model predictions and threshold + # get model predictions and threshold try: - class_of_interest = mt.node_key_to_leaf_class_id[taxon_id] - except: + class_of_interest = mtd.df.loc[taxon_id]["leaf_class_id"] + except Exception: + print("not in the model for some reason") continue preds = tfgpm.eval_one_class_elevation_from_features(feats, class_of_interest) gdfk["pred"] = tf.squeeze(preds).numpy() - thres = thresholds[thresholds.taxon_id==taxon_id].thres.values[0] - - #get precision, recall, prauc, and f1 + thres = thresholds[thresholds.taxon_id == taxon_id].thres.values[0] + + # get precision, recall, prauc, and f1 p, r, area = evaluate_p_r(thres, gdfk, tr_h3, world, False) - if p == None or r == None or ((p+r)==0): + if p is None or r is None or ((p + r) == 0): f1 = None else: f1 = (2 * p * r) / (p + r) prauc, prthres, prf1, prprecision, prrecall = get_prauc(gdfk, tr_h3, False) area = h3.hex_area(h3_resolution) - - #store results + + # store results row = { "taxon_id": taxon_id, "prauc": prauc, "p": p, "r": r, "f1": f1, - "taxon_range_area": len(tr_h3)*area, + "taxon_range_area": len(tr_h3) * area, } row_dict = dict(row) eval_output.append(row_dict) - + eval_output_pd = pd.DataFrame(eval_output) print("evaluation statistics:") - print("\tPR-AUC: "+str(round(eval_output_pd.prauc.mean(),3))) - print("\tPrecision: "+str(round(eval_output_pd.p.mean(),3))) - print("\tRecall: "+str(round(eval_output_pd.r.mean(),3))) - print("\tF1: "+str(round(eval_output_pd.f1.mean(),3))) + print("\tPR-AUC: " + str(round(eval_output_pd.prauc.mean(), 3))) + print("\tPrecision: " + str(round(eval_output_pd.p.mean(), 3))) + print("\tRecall: " + str(round(eval_output_pd.r.mean(), 3))) + print("\tF1: " + str(round(eval_output_pd.f1.mean(), 3))) print("writing output...") eval_output_pd.to_csv(args.output_path) + if __name__ == "__main__": - - info_str = '\nrun as follows\n' + \ - ' python taxon_range_evaluation.py --elevation wc2.1_5m_elev.tif \n' + \ - ' --model v2_8/no_full_shuffle_50k_buffer.h5 \n' + \ - ' --taxonomy v2_8/taxonomy.csv\n' + \ - ' --thresholds v2_8/tf_env_thresh.csv\n' + \ - ' --taxon_range_recalls v2_8/taxon_range_recalls.csv\n' + \ - ' --taxon_range_indicies v2_8/taxon_range_indicies\n' + \ - ' --output_path v2_8/tf_env_eval_test.csv\n' + \ - ' --stop_after 10\n' - + + info_str = "\nrun as follows\n" + \ + " python taxon_range_evaluation.py --elevation wc2.1_5m_elev.tif \n" + \ + " --model v2_8/no_full_shuffle_50k_buffer.h5 \n" + \ + " --taxonomy v2_8/taxonomy.csv\n" + \ + " --thresholds v2_8/tf_env_thresh.csv\n" + \ + 
" --taxon_range_recalls v2_8/taxon_range_recalls.csv\n" + \ + " --taxon_range_indicies v2_8/taxon_range_indicies\n" + \ + " --output_path v2_8/tf_env_eval_test.csv\n" + \ + " --stop_after 10\n" + parser = argparse.ArgumentParser(usage=info_str) - parser.add_argument('--elevation', type=str, - help='Path to elev tif.', required=True) - parser.add_argument('--model', type=str, - help='Path to tf model.', required=True) - parser.add_argument('--taxonomy', type=str, - help='Path to taxonomy csv.', required=True) - parser.add_argument('--thresholds', type=str, - help='Path to thresholds csv.', required=True) - parser.add_argument('--taxon_range_recalls', type=str, - help='Path to taxon_range_recalls csv.', required=True) - parser.add_argument('--taxon_range_indicies', type=str, - help='Path to indices dir.', required=True) - parser.add_argument('--output_path', type=str, - help='file to write thesholds.', required=True) - parser.add_argument('--stop_after', type=int, - help='just run the first x taxa') + parser.add_argument("--elevation", type=str, + help="Path to elev tif.", required=True) + parser.add_argument("--model", type=str, + help="Path to tf model.", required=True) + parser.add_argument("--taxonomy", type=str, + help="Path to taxonomy csv.", required=True) + parser.add_argument("--thresholds", type=str, + help="Path to thresholds csv.", required=True) + parser.add_argument("--taxon_range_recalls", type=str, + help="Path to taxon_range_recalls csv.", required=True) + parser.add_argument("--taxon_range_indicies", type=str, + help="Path to indices dir.", required=True) + parser.add_argument("--output_path", type=str, + help="file to write thesholds.", required=True) + parser.add_argument("--stop_after", type=int, + help="just run the first x taxa") args = parser.parse_args() main(args) - diff --git a/tests/test_inat_inferrer.py b/tests/test_inat_inferrer.py index 9ba1a1c..26967f3 100644 --- a/tests/test_inat_inferrer.py +++ b/tests/test_inat_inferrer.py @@ -18,7 +18,7 @@ def test_initialization(self, inatInferrer): ) tf.keras.models.load_model.assert_any_call( inatInferrer.config["tf_geo_elevation_model_path"], - custom_objects={'ResLayer': ResLayer}, + custom_objects={"ResLayer": ResLayer}, compile=False ) diff --git a/tests/test_model_taxonomy.py b/tests/test_model_taxonomy.py deleted file mode 100644 index 167fff7..0000000 --- a/tests/test_model_taxonomy.py +++ /dev/null @@ -1,54 +0,0 @@ -import pytest -import os -from lib.model_taxonomy import ModelTaxonomy - - -@pytest.fixture() -def taxonomy(): - yield ModelTaxonomy( - os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv") - ) - - -@pytest.fixture() -def taxon(request, taxonomy): - yield next(v for k, v in taxonomy.taxa.items() if v.name == request.param) - - -class TestModelTaxonomyDataframe: - def test_raise_error_on_missing_path(self): - with pytest.raises(FileNotFoundError): - ModelTaxonomy( - os.path.realpath("nonsense") - ) - - @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) - def test_loading_mapping(self, taxon): - assert taxon.id == 7 - assert taxon.parent_id == 6 - assert taxon.rank_level == 10 - assert taxon.leaf_class_id == 1 - assert taxon.name == "Aramus guarauna" - - @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) - def test_nested_set_assigning(self, taxon): - assert taxon.left == 7 - assert taxon.right == 8 - - def test_children_of_root(self, taxonomy): - children = taxonomy.taxon_children[0] - assert len(children) == 2 - assert 
taxonomy.taxa[children[0]].name == "Animalia" - assert taxonomy.taxa[children[1]].name == "Plantae" - - @pytest.mark.parametrize("taxon", ["Animalia"], indirect=True) - def test_children_of_taxon(self, taxonomy, taxon): - children = taxonomy.taxon_children[taxon.id] - assert len(children) == 1 - assert taxonomy.taxa[children[0]].name == "Chordata" - - def test_print(self, capsys, taxonomy): - taxonomy.print() - captured = capsys.readouterr() - assert "├──Animalia :: 0:23" in captured.out - assert "│ └──Chordata :: 1:22" in captured.out diff --git a/tests/test_taxon.py b/tests/test_taxon.py deleted file mode 100644 index 171cd48..0000000 --- a/tests/test_taxon.py +++ /dev/null @@ -1,17 +0,0 @@ -from lib.taxon import Taxon - - -class TestTaxon: - def test_initialization(self): - taxon = Taxon({"id": 0, "name": "Life"}) - assert taxon.name == "Life" - - def test_is_or_descendant_of_self(self): - taxon = Taxon({"id": 1}) - assert taxon.is_or_descendant_of(taxon) - - def test_is_or_descendant_of_taxon(self): - parent_taxon = Taxon({"id": 1, "left": 0, "right": 3}) - child_taxon = Taxon({"id": 2, "left": 1, "right": 2}) - assert child_taxon.is_or_descendant_of(parent_taxon) - assert not parent_taxon.is_or_descendant_of(child_taxon) diff --git a/tests/test_tf_gp_elev_model.py b/tests/test_tf_gp_elev_model.py index 65f0dae..8c01a3b 100644 --- a/tests/test_tf_gp_elev_model.py +++ b/tests/test_tf_gp_elev_model.py @@ -16,7 +16,7 @@ def test_initialization(self, mocker): TFGeoPriorModelElev(model_path) tf.keras.models.load_model.assert_called_once_with( model_path, - custom_objects={'ResLayer': ResLayer}, + custom_objects={"ResLayer": ResLayer}, compile=False ) From c4a946f88aa62eacb194b50862a7e14256a418b8 Mon Sep 17 00:00:00 2001 From: Patrick Leary Date: Fri, 9 Feb 2024 10:51:40 -0500 Subject: [PATCH 2/4] refactor vision_testing to record obs-level stats, test directories, output to CSV; alternate aggregated scoring --- lib/inat_inferrer.py | 1 + lib/inat_vision_api.py | 112 +++++-- lib/model_taxonomy_dataframe.py | 21 +- lib/model_test_data_exporter.py | 4 +- lib/templates/home.html | 4 +- lib/test_observation.py | 3 + lib/vision_testing.py | 521 ++++++++++++++++++-------------- test_model.py | 14 +- 8 files changed, 423 insertions(+), 257 deletions(-) diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py index 26e8d57..355b4e8 100644 --- a/lib/inat_inferrer.py +++ b/lib/inat_inferrer.py @@ -278,6 +278,7 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, print("\nTree of aggregated results:") ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=( lambda row: f"{row.name} [" + f"ID:{row.taxon_id}, " f"V:{round(row.aggregated_vision_score, 4)}, " f"G:{round(row.aggregated_geo_score, 4)}, " f"C:{round(row.aggregated_combined_score, 4)}, " diff --git a/lib/inat_vision_api.py b/lib/inat_vision_api.py index 42fd8b0..e6adfd0 100644 --- a/lib/inat_vision_api.py +++ b/lib/inat_vision_api.py @@ -4,10 +4,12 @@ import urllib import uuid import json +import pandas as pd from flask import Flask, request, render_template from web_forms import ImageForm from inat_inferrer import InatInferrer +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe class InatVisionAPI: @@ -107,6 +109,52 @@ def index_route(self): else: return render_template("home.html") + def best_leaves_from_aggregated_results(self, aggregated_results, iteration=0): + # use a lower threshold on the first pass to have higher representation from + # original model leaf taxa + 
selection_score_threshold = 0.05 if iteration == 0 else 0.1 + remaining_results = aggregated_results.query( + f"selection_score > {selection_score_threshold}" + ) + # set a rank level cutoff on higher taxa to include in results + if iteration > 0: + remaining_results = remaining_results.query( + "rank_level <= 30" + ) + # after setting a cutoff, get the parent IDs of the remaining taxa + parent_taxon_ids = remaining_results["parent_taxon_id"].values # noqa: F841 + # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the + # taxa who are not parents of any remaining taxa + leaf_results = remaining_results.query("taxon_id not in @parent_taxon_ids") + + # lower the scores of ancestors by the scores of the taxa being moved into the result set + for selection_score, aggregated_combined_score, left, right in zip( + leaf_results["selection_score"], + leaf_results["aggregated_combined_score"], + leaf_results["left"], + leaf_results["right"] + ): + self_and_ancestors = remaining_results.query( + f"left <= {left} and right >= {right}" + ) + remaining_results.loc[ + self_and_ancestors.index, + "selection_score" + ] -= selection_score + remaining_results.loc[ + self_and_ancestors.index, + "aggregated_combined_score" + ] -= aggregated_combined_score + + # stop picking taxa if one represents more than 80% of aggregated scores + if leaf_results["normalized_aggregated_combined_score"].max() >= 0.8: + remaining_results = pd.DataFrame() + else: + remaining_results = remaining_results.query( + "selection_score > 0.1" + ) + return [leaf_results, remaining_results] + def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): score_without_geo = (form.score_without_geo.data == "true") filter_taxon = self.inferrer.lookup_taxon(iconic_taxon_id) @@ -117,6 +165,50 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): if form.aggregated.data == "true": aggregated_results = self.inferrer.aggregate_results(leaf_scores, filter_taxon, score_without_geo) + if form.format.data == "tree": + aggregated_results = aggregated_results.query( + "normalized_aggregated_combined_score > 0.001" + ) + printable_tree = ModelTaxonomyDataframe.printable_tree( + aggregated_results, + display_taxon_lambda=( + lambda row: f"{row.name}\t\t[" + f"ID:{row.taxon_id}, " + f"V:{round(row.aggregated_vision_score, 4)}, " + f"G:{round(row.aggregated_geo_score, 4)}, " + f"C:{round(row.aggregated_combined_score, 4)}, " + f"NC:{round(row.normalized_aggregated_combined_score, 4)}]" + ) + ) + return "
" + "
".join(printable_tree) + "
" + + aggregated_results = aggregated_results.query( + "normalized_aggregated_combined_score > 0.05" + ) + + aggregated_results["selection_score"] = aggregated_results[ + "normalized_aggregated_combined_score" + ] + iteration = 0 + leaf_results, remaining_results = self.best_leaves_from_aggregated_results( + aggregated_results, iteration + ) + while len(remaining_results.index) > 0: + iteration += 1 + next_leaf_results, remaining_results = self.best_leaves_from_aggregated_results( + remaining_results, iteration + ) + leaf_results = pd.concat([leaf_results, next_leaf_results]) + + leaf_results = leaf_results.sort_values( + "aggregated_combined_score", + ascending=False + ).head(100) + + score_columns = ["aggregated_combined_score", "aggregated_geo_score", + "aggregated_vision_score", "aggregated_geo_threshold"] + leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index") + columns_to_return = [ "aggregated_combined_score", "aggregated_geo_score", @@ -132,26 +224,6 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): "aggregated_vision_score": "vision_score", "aggregated_geo_threshold": "geo_threshold" } - - # set a cutoff where branches whose combined scores are below the threshold are ignored - # TODO: this threshold is completely arbitrary and needs testing - aggregated_results = aggregated_results.query( - "normalized_aggregated_combined_score > 0.05" - ) - - # after setting a cutoff, get the parent IDs of the remaining taxa - parent_taxon_ids = aggregated_results["parent_taxon_id"].values # noqa: F841 - # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the - # taxa who are not parents of any remaining taxa - leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids") - - leaf_results = leaf_results.sort_values( - "aggregated_combined_score", - ascending=False - ).head(100) - score_columns = ["aggregated_combined_score", "aggregated_geo_score", - "aggregated_vision_score", "aggregated_geo_threshold"] - leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index") final_results = leaf_results[columns_to_return].rename(columns=column_mapping) else: top_combined_score = leaf_scores.sort_values( diff --git a/lib/model_taxonomy_dataframe.py b/lib/model_taxonomy_dataframe.py index 4264d1f..d792e41 100644 --- a/lib/model_taxonomy_dataframe.py +++ b/lib/model_taxonomy_dataframe.py @@ -56,10 +56,10 @@ def load_mapping(self, path, thresholds_path): def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]): for child_id in self.taxon_children[taxon_id]: self.df.at[self.taxon_row_mapping[child_id], "left"] = index - self.taxon_ancestors[child_id] = ancestor_taxon_ids + child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id] + self.taxon_ancestors[child_id] = child_ancestor_taxon_ids index += 1 if child_id in self.taxon_children: - child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id] index = self.assign_nested_values(child_id, index, child_ancestor_taxon_ids) self.df.at[self.taxon_row_mapping[child_id], "right"] = index index += 1 @@ -73,26 +73,35 @@ def children(df, taxon_id): @staticmethod def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): + print("\n".join(ModelTaxonomyDataframe.printable_tree( + df, taxon_id, ancestor_prefix, display_taxon_lambda + ))) + + @staticmethod + def printable_tree(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): children = ModelTaxonomyDataframe.children(df, taxon_id) index = 0 if 
"aggregated_combined_score" in children: children = children.sort_values("aggregated_combined_score", ascending=False) else: children = children.sort_values("name") + linesToPrint = [] for row in children.itertuples(): last_in_branch = (index == len(children) - 1) index += 1 icon = "└──" if last_in_branch else "├──" prefixIcon = " " if last_in_branch else "│ " - print(f"{ancestor_prefix}{icon}", end="") + lineToPrint = f"{ancestor_prefix}{icon}" if display_taxon_lambda is None: - print(f"{row.name} :: {row.left}:{row.right}") + lineToPrint += f"{row.name} :: {row.left}:{row.right}" else: - print(display_taxon_lambda(row)) + lineToPrint += display_taxon_lambda(row) + linesToPrint.append(lineToPrint) if row.right != row.left + 1: - ModelTaxonomyDataframe.print( + linesToPrint += ModelTaxonomyDataframe.printable_tree( df, row.taxon_id, f"{ancestor_prefix}{prefixIcon}", display_taxon_lambda ) + return linesToPrint diff --git a/lib/model_test_data_exporter.py b/lib/model_test_data_exporter.py index f0b32c0..1fa554f 100644 --- a/lib/model_test_data_exporter.py +++ b/lib/model_test_data_exporter.py @@ -89,7 +89,7 @@ async def generate_test_data(self): await self.fetch_more_data() async def fetch_more_data(self): - self.queue = asyncio.Queue(ModelTestDataExporter.N_WORKERS) + self.queue = asyncio.Queue() self.workers = [asyncio.create_task(self.worker_task()) for _ in range(ModelTestDataExporter.N_WORKERS)] min_pages_remaining = math.ceil( @@ -97,7 +97,7 @@ async def fetch_more_data(self): ) print(f"Queueing {min_pages_remaining} workers") for i in range(min_pages_remaining): - await self.queue.put(i) + self.queue.put_nowait(i) await self.queue.join() for worker in self.workers: worker.cancel() diff --git a/lib/templates/home.html b/lib/templates/home.html index 37244e9..649d3ee 100644 --- a/lib/templates/home.html +++ b/lib/templates/home.html @@ -23,18 +23,20 @@

[home.html hunk body unrecoverable: the HTML tags were stripped during extraction; the surviving text nodes are the "Slim vs Legacy Model" heading and a "Lng:" field label, plus three added lines]
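For reference, the H3 feature step in taxon_range_evaluation.py's main() melts the elevation raster into lat/lng/elevation rows, bins them into resolution-4 H3 cells, and averages elevation per cell. A minimal runnable sketch of that binning, using a few toy points in place of the raster (the "h3_04" index name is what h3pandas produces at resolution 4):

import pandas as pd
import h3pandas  # noqa: F401  registers the .h3 accessor used below

# toy stand-in for the melted elevation raster (lat/lng/elevation rows)
im_df = pd.DataFrame({
    "lat": [10.00, 10.01, 55.00],
    "lng": [20.00, 20.01, -3.00],
    "elevation": [100.0, 120.0, 300.0],
})

h3_resolution = 4
elev_dfh3 = im_df.h3.geo_to_h3(h3_resolution)
# drop the coordinate columns and average elevation within each H3 cell
elev_dfh3 = elev_dfh3.drop(columns=["lat", "lng"]).groupby("h3_04").mean()
print(elev_dfh3)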
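Similarly, the threshold selection in get_prauc() picks the point on the precision-recall curve that maximizes F1, guarding the division where precision + recall is zero. A self-contained sketch on synthetic labels and scores (both invented for illustration):

import numpy as np
from sklearn.metrics import auc, precision_recall_curve

rng = np.random.default_rng(0)
test = rng.integers(0, 2, 1000)          # 1 = H3 cell falls inside the taxon range
predictions = np.clip(0.5 * test + 0.6 * rng.random(1000), 0.0, 1.0)

precision, recall, thresholds = precision_recall_curve(test, predictions)
p1 = 2 * precision * recall
p2 = precision + recall
# F1 for every candidate threshold, zero wherever precision + recall == 0
fscore = np.divide(p1, p2, out=np.zeros_like(p1), where=p2 != 0)
index = np.argmax(fscore)
print(f"best F1 {fscore[index]:.3f} at threshold {thresholds[index]:.3f}")
print(f"PR-AUC {auc(recall, precision):.3f}")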
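Finally, the iterative leaf selection in best_leaves_from_aggregated_results() leans on the taxonomy's nested-set bounds: the leaves of the pruned tree are the taxa no remaining taxon lists as parent, and each selected leaf's score is subtracted from itself and its ancestors, so a higher taxon only surfaces on a later iteration if its children left score unexplained. A toy sketch with invented scores (taxon 1 is the parent of taxa 2 and 3):

import pandas as pd

df = pd.DataFrame([
    {"taxon_id": 1, "parent_taxon_id": 0, "left": 0, "right": 5, "selection_score": 0.90},
    {"taxon_id": 2, "parent_taxon_id": 1, "left": 1, "right": 2, "selection_score": 0.55},
    {"taxon_id": 3, "parent_taxon_id": 1, "left": 3, "right": 4, "selection_score": 0.30},
])

# pruned-tree leaves: taxa that no remaining taxon claims as its parent
parent_taxon_ids = df["parent_taxon_id"].values  # noqa: F841
leaf_results = df.query("taxon_id not in @parent_taxon_ids")

# discount each selected leaf's score from itself and its ancestors
for left, right, score in zip(leaf_results["left"], leaf_results["right"],
                              leaf_results["selection_score"]):
    lineage = df.query(f"left <= {left} and right >= {right}")
    df.loc[lineage.index, "selection_score"] -= score

print(leaf_results["taxon_id"].tolist())  # [2, 3]
print(round(df.loc[df.taxon_id == 1, "selection_score"].item(), 2))  # 0.90 - 0.55 - 0.30 = 0.05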