diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..6c49b0b
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,3 @@
+[scripts]
+tests = "pytest -s"
+coverage = "bash -c 'coverage run -m pytest -s && coverage report --show-missing'"
diff --git a/forms.py b/forms.py
index 420dd8c..26f0371 100644
--- a/forms.py
+++ b/forms.py
@@ -1,8 +1,11 @@
 from flask_wtf import FlaskForm
 from flask_wtf.file import FileField, FileRequired
 
+
 class ImageForm(FlaskForm):
-    image = FileField('image',
+    image = FileField(
+        "image",
         validators=[
             FileRequired(message="Please include 'image' field.")
-        ])
+        ]
+    )
diff --git a/generate_thresholds.py b/generate_thresholds.py
index 3f0bd14..47ff87a 100644
--- a/generate_thresholds.py
+++ b/generate_thresholds.py
@@ -4,218 +4,29 @@
 import argparse
 import tifffile
-import os
 import pandas as pd
 import numpy as np
 import h3
-import h3pandas
+import h3pandas  # noqa: F401
 import tensorflow as tf
-import csv
-import math
-import json
 from tqdm.auto import tqdm
-import tensorflow as tf
 from sklearn.metrics import precision_recall_curve
-import matplotlib.pyplot as plt
 import warnings
+from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe
+from lib.tf_gp_elev_model import TFGeoPriorModelElev
 
-
-class ResLayer(tf.keras.layers.Layer):
-    def __init__(self):
-        super(ResLayer, self).__init__()
-        self.w1 = tf.keras.layers.Dense(
-            256, activation="relu", kernel_initializer="he_normal"
-        )
-        self.w2 = tf.keras.layers.Dense(
-            256, activation="relu", kernel_initializer="he_normal"
-        )
-        self.dropout = tf.keras.layers.Dropout(rate=0.5)
-        self.add = tf.keras.layers.Add()
-
-    def call(self, inputs):
-        x = self.w1(inputs)
-        x = self.dropout(x)
-        x = self.w2(x)
-        x = self.add([x, inputs])
-        return x
-
-    def get_config(self):
-        return {}
-
-class Taxon:
-
-    def __init__(self, row):
-        for key in row:
-            setattr(self, key, row[key])
-
-    def set(self, attr, val):
-        setattr(self, attr, val)
-
-    def is_or_descendant_of(self, taxon):
-        if self.id == taxon.id:
-            return True
-        return self.descendant_of(taxon)
-
-    # using the nested set left and right values, a taxon is a descendant of another
-    # as long as its left is higher and its right is lower
-    def descendant_of(self, taxon):
-        return self.left > taxon.left and self.right < taxon.right
-
-class ModelTaxonomy:
-
-    def __init__(self, path):
-        self.load_mapping(path)
-        self.assign_nested_values()
-
-    def load_mapping(self, path):
-        self.node_key_to_leaf_class_id = {}
-        self.leaf_class_to_taxon = {}
-        # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0,
-        # so create a fake taxon of Life to represent the root of the entire tree
-        self.taxa = {0: Taxon({"name": "Life", "depth": 0})}
-        self.taxon_children = {}
-        try:
-            with open(path) as csv_file:
-                csv_reader = csv.DictReader(csv_file, delimiter=",")
-                for row in csv_reader:
-                    taxon_id = int(row["taxon_id"])
-                    rank_level = float(row["rank_level"])
-                    leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None
-                    parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0
-                    # some taxa are not leaves and aren't represented in the leaf layer
-                    if leaf_class_id is not None:
-                        self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id
-                        self.leaf_class_to_taxon[leaf_class_id] = taxon_id
-                    self.taxa[taxon_id] = Taxon({
-                        "id": taxon_id,
-                        "name": row["name"],
-                        "parent_id": parent_id,
-                        "leaf_class_id": leaf_class_id,
-                        "rank_level": rank_level
-                    })
-                    if parent_id not in self.taxon_children:
-                        self.taxon_children[parent_id] = []
-                    self.taxon_children[parent_id].append(taxon_id)
-        except IOError as e:
-            print(e)
-            print(f"\n\nCannot open mapping file `{path}`\n\n")
-            raise e
-
-    # prints to the console a representation of this tree
-    def print(self, taxon_id=0, ancestor_prefix=""):
-        children = self.taxon_children[taxon_id]
-        index = 0
-        for child_id in children:
-            last_in_branch = (index == len(children) - 1)
-            index += 1
-            icon = "└──" if last_in_branch else "├──"
-            prefixIcon = "   " if last_in_branch else "│   "
-            taxon = self.taxa[child_id]
-            print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}')
-            if child_id in self.taxon_children:
-                self.print(child_id, f"{ancestor_prefix}{prefixIcon}")
-
-    # calculated nested set left and right values and depth representing how many nodes
-    # down the taxon is from Life. These can be later used for an efficient way to calculate
-    # if a taxon is a descendant of another
-    def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]):
-        for child_id in self.taxon_children[taxon_id]:
-            self.taxa[child_id].set("left", index)
-            self.taxa[child_id].set("depth", depth)
-            self.taxa[child_id].set("ancestors", ancestors)
-            index += 1
-            if child_id in self.taxon_children:
-                child_ancestors = ancestors + [child_id]
-                index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors)
-            self.taxa[child_id].set("right", index)
-            index += 1
-        return index
-
-
-class TFGeoPriorModelEnv:
-
-    def __init__(self, model, taxonomy):
-        self.taxonomy = taxonomy
-        # initialize the geo model for inference
-        self.gpmodel = tf.keras.models.load_model(
-            model,
-            custom_objects={'ResLayer': ResLayer},
-            compile=False
-        )
-
-
-    def features_for_one_class_elevation(self, latitude, longitude, elevation):
-        """Evalutes the model for a single class and multiple locations
-
-        Args:
-            latitude (list): A list of latitudes
-            longitude (list): A list of longitudes (same length as latitude)
-            elevation (list): A list of elevations (same length as latitude)
-            class_of_interest (int): The single class to eval
-
-        Returns:
-            numpy array: scores for class of interest at each location
-        """
-        def encode_loc(latitude, longitude, elevation):
-            latitude = np.array(latitude)
-            longitude = np.array(longitude)
-            elevation = np.array(elevation)
-            elevation = elevation.astype("float32")
-            grid_lon = longitude.astype('float32') / 180.0
-            grid_lat = latitude.astype('float32') / 90.0
-
-            elevation[elevation>0] = elevation[elevation>0]/6574.0
-            elevation[elevation<0] = elevation[elevation<0]/32768.0
-            norm_elev = elevation
-
-            if np.isscalar(grid_lon):
-                grid_lon = np.array([grid_lon])
-            if np.isscalar(grid_lat):
-                grid_lat = np.array([grid_lat])
-            if np.isscalar(norm_elev):
-                norm_elev = np.array([norm_elev])
-
-            norm_loc = tf.stack([grid_lon, grid_lat], axis=1)
-
-            encoded_loc = tf.concat([
-                tf.sin(norm_loc * math.pi),
-                tf.cos(norm_loc * math.pi),
-                tf.expand_dims(norm_elev, axis=1),
-
-            ], axis=1)
-
-            return encoded_loc
-
-        encoded_loc = encode_loc(latitude, longitude, elevation)
-        loc_emb = self.gpmodel.layers[0](encoded_loc)
-
-        # res layers - feature extraction
-        x = self.gpmodel.layers[1](loc_emb)
-        x = self.gpmodel.layers[2](x)
-        x = self.gpmodel.layers[3](x)
-        x = self.gpmodel.layers[4](x)
-
-        # process just the one class
-        return x
-
-    def eval_one_class_elevation_from_features(self, x, class_of_interest):
-        return tf.keras.activations.sigmoid(
-            tf.matmul(
-                x,
-                tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0),
-                transpose_b=True
-            )
-        ).numpy()
 
 def ignore_shapely_deprecation_warning(message, category, filename, lineno, file=None, line=None):
     if "array interface is deprecated" in str(message):
         return None
     return warnings.defaultaction(message, category, filename, lineno, file, line)
 
+
 def main(args):
     print("loading in the model...")
-    mt = ModelTaxonomy(args.taxonomy)
-    tfgpm = TFGeoPriorModelEnv(args.model, mt)
-
+    mtd = ModelTaxonomyDataframe(args.taxonomy, None)
+    tfgpm = TFGeoPriorModelElev(args.model)
+
     print("setting up the map...")
     warnings.showwarning = ignore_shapely_deprecation_warning
     im = tifffile.imread(args.elevation)
@@ -229,13 +40,13 @@ def main(args):
     im_df.columns = ["lat", "lng", "elevation"]
     elev_dfh3 = im_df.h3.geo_to_h3(args.h3_resolution)
     elev_dfh3 = elev_dfh3.drop(
-        columns=['lng', 'lat']
-    ).groupby("h3_0"+str(args.h3_resolution)).mean()
+        columns=["lng", "lat"]
+    ).groupby("h3_0" + str(args.h3_resolution)).mean()
     gdfk = elev_dfh3.h3.h3_to_geo()
     gdfk["lng"] = gdfk["geometry"].x
     gdfk["lat"] = gdfk["geometry"].y
     _ = gdfk.pop("geometry")
-    gdfk = gdfk.rename_axis('h3index')
+    gdfk = gdfk.rename_axis("h3index")
 
     print("making features...")
     feats = tfgpm.features_for_one_class_elevation(
@@ -245,13 +56,20 @@ def main(args):
     )
 
     print("loading in the training data...")
-    train_df = pd.read_csv(args.train_spatial_data,
-                           usecols=["taxon_id","latitude","longitude","captive"]).rename({
+    train_df = pd.read_csv(
+        args.train_spatial_data,
+        usecols=[
+            "taxon_id",
+            "latitude",
+            "longitude",
+            "captive"
+        ]
+    ).rename({
         "latitude": "lat",
         "longitude": "lng"
     }, axis=1)
-    train_df = train_df[train_df.captive==0] #no-CID ok, wild only
-    train_df.drop(["captive"],axis=1)
+    train_df = train_df[train_df.captive == 0]  # no-CID ok, wild only
+    train_df.drop(["captive"], axis=1)
     train_df_h3 = train_df.h3.geo_to_h3(args.h3_resolution)
     all_spatial_grid_counts = train_df_h3.index.value_counts()
     presence_absence = pd.DataFrame({
@@ -261,89 +79,103 @@
 
     print("...looping through taxa")
     output = []
-    taxa = pd.read_csv(args.taxonomy, usecols=["taxon_id","leaf_class_id","iconic_class_id"]).dropna(subset=['leaf_class_id'])
+    taxa = pd.read_csv(
+        args.taxonomy,
+        usecols=[
+            "taxon_id",
+            "leaf_class_id",
+            "iconic_class_id"
+        ]
+    ).dropna(subset=["leaf_class_id"])
     taxon_ids = taxa.taxon_id
     if args.stop_after is not None:
-        taxon_ids = taxon_ids[0:args.stop_after]
-    desired_recall = 0.95
+        taxon_ids = taxon_ids[0:args.stop_after]
     resolution = args.h3_resolution
     area = h3.hex_area(resolution)
 
     for taxon_id in tqdm(taxon_ids):
         try:
-            class_of_interest = mt.node_key_to_leaf_class_id[taxon_id]
-        except:
-            print('not in the model for some reason')
+            class_of_interest = mtd.df.loc[taxon_id]["leaf_class_id"]
+        except Exception:
+            print("not in the model for some reason")
             continue
 
-        #get predictions
+        # get predictions
         preds = tfgpm.eval_one_class_elevation_from_features(feats, class_of_interest)
         gdfk["pred"] = tf.squeeze(preds).numpy()
-
-        #make presence absence dataset
-        target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts()
+
+        # make presence absence dataset
+        target_spatial_grid_counts = \
+            train_df_h3[train_df_h3.taxon_id == taxon_id].index.value_counts()
         presences = gdfk.loc[target_spatial_grid_counts.index]["pred"]
         if len(presences) == 0:
            print("not present")
            continue
-
-        #calculate threhold
+
+        # calculate threshold
         presence_absence["forground"] = target_spatial_grid_counts
         presence_absence["predictions"] = gdfk["pred"]
         presence_absence.forground = presence_absence.forground.fillna(0)
-        yield_cutoff = np.percentile((presence_absence["background"]/presence_absence["forground"])[presence_absence["forground"]>0], 95)
-        absences = presence_absence[(presence_absence["forground"]==0) & (presence_absence["background"] > yield_cutoff)]["predictions"]
-        presences = presence_absence[(presence_absence["forground"]>0)]["predictions"]
-        df_x = pd.DataFrame({'predictions': presences, 'test': 1})
-        df_y = pd.DataFrame({'predictions': absences, 'test': 0})
+        yield_cutoff = np.percentile((
+            presence_absence["background"] / presence_absence["forground"]
+        )[presence_absence["forground"] > 0], 95)
+        absences = presence_absence[
+            (presence_absence["forground"] == 0) & (presence_absence["background"] > yield_cutoff)
+        ]["predictions"]
+        presences = presence_absence[(presence_absence["forground"] > 0)]["predictions"]
+        df_x = pd.DataFrame({"predictions": presences, "test": 1})
+        df_y = pd.DataFrame({"predictions": absences, "test": 0})
         for_thres = pd.concat([df_x, df_y], ignore_index=False)
-        precision, recall, thresholds = precision_recall_curve(for_thres.test, for_thres.predictions)
+        precision, recall, thresholds = precision_recall_curve(
+            for_thres.test,
+            for_thres.predictions
+        )
         p1 = (2 * precision * recall)
         p2 = (precision + recall)
-        out = np.zeros( (len(p1)) )
-        fscore = np.divide(p1,p2, out=out, where=p2!=0)
+        out = np.zeros((len(p1)))
+        fscore = np.divide(p1, p2, out=out, where=p2 != 0)
         index = np.argmax(fscore)
         thres = thresholds[index]
-
-        #store daa
+
+        # store data
         row = {
             "taxon_id": taxon_id,
             "thres": thres,
-            "area": len(gdfk[gdfk.pred >= thres])*area
+            "area": len(gdfk[gdfk.pred >= thres]) * area
         }
         row_dict = dict(row)
         output.append(row_dict)
-
+
     print("writing output...")
     output_pd = pd.DataFrame(output)
-    output_pd.to_csv(args.output_dir+"/thresholds.csv")
+    output_pd.to_csv(args.output_dir + "/thresholds.csv")
+
 
 if __name__ == "__main__":
-
-    info_str = '\nrun as follows\n' + \
-        '   python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n' + \
-        '   --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n' + \
-        '   --taxonomy taxonomy_1_4.csv\n' + \
-        '   --train_spatial_data v2_6/taxonomy.csv\n' + \
-        '   --output_dir v2_6\n' + \
-        '   --h3_resolution 4\n' + \
-        '   --stop_after 10\n'
-
+    info_str = "\nrun as follows\n" + \
+        "   python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n" + \
+        "   --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n" + \
+        "   --taxonomy taxonomy_1_4.csv\n" + \
+        "   --train_spatial_data v2_6/taxonomy.csv\n" + \
+        "   --output_dir v2_6\n" + \
+        "   --h3_resolution 4\n" + \
+        "   --stop_after 10\n"
+
     parser = argparse.ArgumentParser(usage=info_str)
-    parser.add_argument('--elevation', type=str,
-                        help='Path to elev tif.', required=True)
-    parser.add_argument('--model', type=str,
-                        help='Path to tf model.', required=True)
-    parser.add_argument('--taxonomy', type=str,
-                        help='Path to taxonomy csv.', required=True)
-    parser.add_argument('--train_spatial_data', type=str,
-                        help='Path to train csv for occupancy.', required=True)
-    parser.add_argument('--output_dir', type=str,
-                        help='directory to write thesholds.', required=True)
-    parser.add_argument('--h3_resolution', type=int, default=4,
-                        help='grid resolution from 0 - 15, lower numbers are coarser/faster. Currently using 4')
-    parser.add_argument('--stop_after', type=int,
-                        help='just run the first x taxa')
+    parser.add_argument("--elevation", type=str,
+                        help="Path to elev tif.", required=True)
+    parser.add_argument("--model", type=str,
+                        help="Path to tf model.", required=True)
+    parser.add_argument("--taxonomy", type=str,
+                        help="Path to taxonomy csv.", required=True)
+    parser.add_argument("--train_spatial_data", type=str,
+                        help="Path to train csv for occupancy.", required=True)
+    parser.add_argument("--output_dir", type=str,
+                        help="directory to write thresholds.", required=True)
+    parser.add_argument("--h3_resolution", type=int, default=4,
+                        help="grid resolution from 0 - 15, lower numbers are coarser/faster. "
+                             "Currently using 4")
+    parser.add_argument("--stop_after", type=int,
+                        help="just run the first x taxa")
 
     args = parser.parse_args()
     main(args)
-
\ No newline at end of file
diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py
index f9eb0aa..3338504 100644
--- a/lib/inat_inferrer.py
+++ b/lib/inat_inferrer.py
@@ -13,53 +13,117 @@
 from lib.vision_inferrer import VisionInferrer
 from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe
 
-# TODO: better way to address the SettingWithCopyWarning warning?
-pd.options.mode.chained_assignment = None
-
-MINIMUM_GEO_SCORE = 0.005
+pd.options.mode.copy_on_write = True
 
 
 class InatInferrer:
 
+    MINIMUM_GEO_SCORE = 0.005
+    COMMON_ANCESTOR_CUTOFF_RATIO = 0.01
+    COMMON_ANCESTOR_WINDOW = 15
+
     def __init__(self, config):
         self.config = config
-        self.setup_taxonomy(config)
-        self.setup_synonyms(config)
-        self.setup_vision_model(config)
-        self.setup_elevation_dataframe(config)
-        self.setup_geo_model(config)
+        self.setup_taxonomy()
+        self.setup_synonyms()
+        self.setup_synonym_taxonomy()
+        self.setup_vision_model()
+        self.setup_elevation_dataframe()
+        self.setup_geo_model()
         self.upload_folder = "static/"
 
-    def setup_taxonomy(self, config):
+    def setup_taxonomy(self):
         self.taxonomy = ModelTaxonomyDataframe(
-            config["taxonomy_path"],
-            config["tf_elev_thresholds"] if "tf_elev_thresholds" in config else None
+            self.config["taxonomy_path"],
+            self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None
         )
 
-    def setup_synonyms(self, config):
+    def setup_synonyms(self):
         self.synonyms = None
-        if "synonyms_path" in config:
-            if not os.path.exists(config["synonyms_path"]):
-                return None
-            self.synonyms = pd.read_csv(config["synonyms_path"])
+        if "synonyms_path" not in self.config:
+            return
+
+        if not os.path.exists(self.config["synonyms_path"]):
+            return
+
+        self.synonyms = pd.read_csv(
+            self.config["synonyms_path"],
+            dtype={
+                "model_taxon_id": int,
+                "parent_taxon_id": "Int64",
+                "taxon_id": "Int64",
+                "rank_level": float,
+                "name": pd.StringDtype()
+            }
+        )
+
+    def setup_synonym_taxonomy(self):
+        if self.synonyms is None:
+            return
 
-    def setup_vision_model(self, config):
-        self.vision_inferrer = VisionInferrer(config["vision_model_path"])
+        if "synonyms_taxonomy_path" not in self.config:
+            return
 
-    def setup_elevation_dataframe(self, config):
+        synonym_taxonomy = ModelTaxonomyDataframe(
+            self.config["synonyms_taxonomy_path"],
+            self.config["tf_elev_thresholds"] if "tf_elev_thresholds" in self.config else None
+        )
+        # ensure the leaf_class_ids from the synonym taxonomy are identical
+        # to the taxonomy generated at data export time
+        if not self.taxonomy.leaf_df.index.equals(synonym_taxonomy.leaf_df.index):
+            error = "Synonym taxonomy does not match the model taxonomy"
+            print(error)
+            raise RuntimeError(error)
+
+        synonym_taxon_ids = np.unique(pd.array(self.synonyms["taxon_id"].dropna().values))
+        synonym_taxonomy_taxon_ids = np.unique(
+            pd.array(synonym_taxonomy.df[
+                synonym_taxonomy.df.taxon_id.isin(synonym_taxon_ids)
+            ]["taxon_id"].values)
+        )
+        synonym_taxon_ids_not_present_in_taxonomy = np.setdiff1d(
+            synonym_taxon_ids, synonym_taxonomy_taxon_ids
+        )
+        # ensure all taxa referenced in the synonym mappings file are present in the
+        # updated taxonomy that should include all original taxa plus all synonyms
+        if synonym_taxon_ids_not_present_in_taxonomy.size > 0:
+            error = "There are taxa in the synonyms file not present in the synonyms " + \
+                f"taxonomy: {synonym_taxon_ids_not_present_in_taxonomy}"
+            print(error)
+            raise RuntimeError(error)
+
+        synonym_taxonomy.leaf_df["has_synonyms"] = False
+        # mark taxa that should be replaced or removed as having synonyms
+        for index, taxon in self.taxonomy.leaf_df[self.taxonomy.leaf_df["taxon_id"].isin(
+            self.synonyms["model_taxon_id"]
+        )].iterrows():
+            synonym_taxonomy.leaf_df.loc[taxon["leaf_class_id"], "has_synonyms"] = True
+
+        # replace the originally exported taxonomy with the updated taxonomy that includes synonyms
+        self.taxonomy = synonym_taxonomy
+
+    def setup_vision_model(self):
+        self.vision_inferrer = VisionInferrer(self.config["vision_model_path"])
+
+    def setup_elevation_dataframe(self):
         self.geo_elevation_cells = None
+        if "elevation_h3_r4" not in self.config:
+            return
+
         # load elevation data stored at H3 resolution 4
-        if "elevation_h3_r4" in config:
-            self.geo_elevation_cells = pd.read_csv(config["elevation_h3_r4"]). \
-                sort_values("h3_04").set_index("h3_04").sort_index()
-            self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe(self.geo_elevation_cells)
+        self.geo_elevation_cells = pd.read_csv(self.config["elevation_h3_r4"]). \
+            sort_values("h3_04").set_index("h3_04").sort_index()
+        self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe(
+            self.geo_elevation_cells
+        )
 
-    def setup_elevation_dataframe_from_worldclim(self, config, resolution):
+    def setup_elevation_dataframe_from_worldclim(self, resolution):
         # preventing from processing at too high a resolution
         if resolution > 5:
             return
-        if "wc2.1_5m_elev_2.tif" in config:
-            im = tifffile.imread(config["wc2.1_5m_elev_2.tif"])
+
+        if "wc2.1_5m_elev_2.tif" in self.config:
+            im = tifffile.imread(self.config["wc2.1_5m_elev_2.tif"])
             im_df = pd.DataFrame(im)
             im_df.index = np.linspace(90, -90, 2160)
             im_df.columns = np.linspace(-180, 180, 4320)
@@ -67,18 +131,23 @@ def setup_elevation_dataframe_from_worldclim(self, config, resolution):
         im_df = im_df.melt(id_vars=["index"])
         im_df.columns = ["lat", "lng", "elevation"]
         elev_dfh3 = im_df.h3.geo_to_h3(resolution)
-        elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f'h3_0{resolution}').mean()
+        elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f"h3_0{resolution}").mean()
 
-    def setup_geo_model(self, config):
+    def setup_geo_model(self):
         self.geo_elevation_model = None
         self.geo_model_features = None
-        if "tf_geo_elevation_model_path" in config and self.geo_elevation_cells is not None:
-            self.geo_elevation_model = TFGeoPriorModelElev(config["tf_geo_elevation_model_path"])
-            self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation(
-                latitude=list(self.geo_elevation_cells.lat),
-                longitude=list(self.geo_elevation_cells.lng),
-                elevation=list(self.geo_elevation_cells.elevation)
-            )
+        if "tf_geo_elevation_model_path" not in self.config:
+            return
+
+        if self.geo_elevation_cells is None:
+            return
+
+        self.geo_elevation_model = TFGeoPriorModelElev(self.config["tf_geo_elevation_model_path"])
+        self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation(
+            latitude=list(self.geo_elevation_cells.lat),
+            longitude=list(self.geo_elevation_cells.lng),
+            elevation=list(self.geo_elevation_cells.elevation)
+        )
 
     def vision_predict(self, image, debug=False):
         if debug:
@@ -93,8 +162,10 @@ def geo_model_predict(self, lat, lng, debug=False):
         start_time = time.time()
         if lat is None or lat == "" or lng is None or lng == "":
             return None
+
         if self.geo_elevation_model is None:
             return None
+
         # lookup the H3 cell this lat lng occurs in
         h3_cell = h3.geo_to_h3(float(lat), float(lng), 4)
         h3_cell_centroid = h3.h3_to_geo(h3_cell)
@@ -109,10 +180,11 @@
     def lookup_taxon(self, taxon_id):
         if taxon_id is None:
             return None
+
         try:
             return self.taxonomy.df.loc[taxon_id]
         except Exception as e:
-            print(f'taxon `{taxon_id}` does not exist in the taxonomy')
+            print(f"taxon `{taxon_id}` does not exist in the taxonomy")
             raise e
 
     def predictions_for_image(self, file_path, lat, lng, filter_taxon, score_without_geo=False,
@@ -122,11 +194,13 @@
         image = InatInferrer.prepare_image_for_inference(file_path)
         raw_vision_scores = self.vision_predict(image, debug)
         raw_geo_scores = self.geo_model_predict(lat, lng, debug)
-        top100 = self.combine_results(raw_vision_scores, raw_geo_scores, filter_taxon,
-                                      score_without_geo, debug)
+        combined_scores = self.combine_results(
+            raw_vision_scores, raw_geo_scores, filter_taxon, score_without_geo, debug
+        )
+        combined_scores = self.map_result_synonyms(combined_scores, debug)
         if debug:
             print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.))
-        return top100
+        return combined_scores
 
     def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon,
                         score_without_geo=False, debug=False):
@@ -144,7 +218,7 @@
         leaf_scores["geo_score"] = 0 if no_geo_scores else raw_geo_scores
         # set a lower limit for geo scores if there are any
         leaf_scores["normalized_geo_score"] = 0 if no_geo_scores \
-            else leaf_scores["geo_score"].clip(MINIMUM_GEO_SCORE, None)
+            else leaf_scores["geo_score"].clip(InatInferrer.MINIMUM_GEO_SCORE, None)
 
         # if filtering by a taxon, restrict results to that taxon and its descendants
         if filter_taxon is not None:
@@ -154,7 +228,8 @@
             )
             # normalize the vision scores so they add up to 1 after filtering
             sum_of_vision_scores = leaf_scores["vision_score"].sum()
-            leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] / sum_of_vision_scores
+            leaf_scores["normalized_vision_score"] = \
+                leaf_scores["vision_score"] / sum_of_vision_scores
         else:
             # when not filtering by a taxon, the normalized vision score is the same as the original
             leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"]
@@ -170,46 +245,96 @@
                 leaf_scores["normalized_geo_score"]
         if debug:
             print("Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+        leaf_scores.reset_index(drop=True, inplace=True)
+        return leaf_scores
+
+    def map_result_synonyms(self, leaf_scores, debug=False):
+        if self.synonyms is None or "has_synonyms" not in leaf_scores.columns:
+            return leaf_scores
+
+        if debug:
+            start_time = time.time()
+        # create an empty dataframe to hold synonym replacements
+        synonyms_dataframe = pd.DataFrame(
+            columns=leaf_scores.columns
+        ).set_index("taxon_id", drop=False)
+        # loop through the taxa in leaf_scores that have synonym mappings
+        for taxon in leaf_scores[
+            leaf_scores.taxon_id.isin(self.synonyms["model_taxon_id"])
+        ].to_dict("records"):
+            for synonym in self.synonyms[
+                self.synonyms["model_taxon_id"] == taxon["taxon_id"]
+            ].to_dict("records"):
+                # the taxon has been removed, so there is no replacement
+                if pd.isna(synonym["taxon_id"]):
+                    continue
+
+                # then replace some attributes of the leaf_scores taxon, while keeping
+                # all other columns, like the scores, untouched
+                replacement = taxon.copy()
+                replacement["parent_taxon_id"] = synonym["parent_taxon_id"]
+                replacement["taxon_id"] = synonym["taxon_id"]
+                replacement["rank_level"] = synonym["rank_level"]
+                replacement["name"] = synonym["name"]
+                replacement["left"] = np.nan
+                replacement["right"] = np.nan
+                # add the replacement taxon to the synonyms dataframe
+                synonyms_dataframe.loc[replacement["taxon_id"]] = replacement
+        # remove all taxa from leaf scores that had synonym mappings
+        leaf_scores = leaf_scores.query("has_synonyms == False")
+        if not synonyms_dataframe.empty:
+            # inject the synonym replacements into leaf_scores
+            leaf_scores = pd.concat([
+                leaf_scores.query("has_synonyms == False"), synonyms_dataframe
+            ], axis=0)
+        if debug:
+            print("Synonym Mapping Time: %0.2fms" % ((time.time() - start_time) * 1000.))
         return leaf_scores
 
-    def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, debug=False):
+    def aggregate_results(self, leaf_scores, debug=False,
+                          score_ratio_cutoff=0.001,
+                          max_leaf_scores_to_consider=None,
+                          column_for_cutoff="combined_score"):
         if debug:
             start_time = time.time()
-        no_geo_scores = (leaf_scores["geo_score"].max() == 0)
-        # make a copy of the full taxonomy including non-leaves to be used for storing results
-        if filter_taxon is not None:
-            # using nested set left and right values, select the filter taxon,
-            # its descendants, and its ancestors
-            all_node_scores = self.taxonomy.df.query(
-                f'(left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}) or' +
-                f'(left < {filter_taxon["left"]} and right > {filter_taxon["right"]})'
-            ).copy().reset_index(drop=True)
-        else:
-            all_node_scores = self.taxonomy.df.copy().reset_index(drop=True)
+        # start with a copy of the whole taxonomy
+        all_node_scores = self.taxonomy.df.copy().reset_index(drop=True)
 
-        # copy the score columns from the already-calculated leaf scores
+        # copy columns from the already calculated leaf scores including scores
+        # and class_id columns which will not be populated for synonyms in the taxonomy
         all_node_scores = pd.merge(all_node_scores, leaf_scores[[
-            "taxon_id", "vision_score", "normalized_vision_score", "geo_score",
-            "normalized_geo_score"]], on="taxon_id", how="left").set_index("taxon_id", drop=False)
-
-        # calculate the highest combined score
+            "taxon_id", "vision_score", "normalized_vision_score", "geo_score", "combined_score",
+            "normalized_geo_score", "leaf_class_id", "iconic_class_id", "spatial_class_id"]],
+            on="taxon_id",
+            how="left",
+            suffixes=["_x", None]
+        ).set_index("taxon_id", drop=False)
+
+        # calculate the highest combined score from leaf_scores
         top_combined_score = leaf_scores.sort_values(
-            "combined_score", ascending=False).head(1)["combined_score"].values[0]
-        lower_cutoff_threshold = 0.0001
-        # determine a lower-bound cutoff where results with combined scores below this cutoff
-        # will be ignored. This isn't necessary for scoring, but it improves performance
-        # TODO: evaluate this
-        cutoff = max([0.00001, top_combined_score * lower_cutoff_threshold])
+            column_for_cutoff, ascending=False
+        ).head(1)[column_for_cutoff].values[0]
+        # define some cutoff based on a percentage of the top combined score. Taxa with
+        # scores below the cutoff will be ignored when aggregating scores up the taxonomy
+        cutoff = top_combined_score * score_ratio_cutoff
 
-        aggregated_scores = {}
         # restrict score aggregation to results where the combined score is above the cutoff
-        scores_to_aggregate = leaf_scores.query(f'combined_score > {cutoff}')
+        scores_to_aggregate = leaf_scores.query(
+            f"{column_for_cutoff} > {cutoff}"
+        )
+        if max_leaf_scores_to_consider is not None:
+            scores_to_aggregate = scores_to_aggregate.sort_values(
+                column_for_cutoff, ascending=False
+            ).head(max_leaf_scores_to_consider)
+
         # loop through all results where the combined score is above the cutoff
-        for taxon_id, vision_score, geo_score, geo_threshold in zip(
+        aggregated_scores = {}
+        for taxon_id, vision_score, geo_score, combined_score, geo_threshold in zip(
             scores_to_aggregate["taxon_id"],
             scores_to_aggregate["normalized_vision_score"],
-            scores_to_aggregate["normalized_geo_score"],
+            scores_to_aggregate["geo_score"],
+            scores_to_aggregate["combined_score"],
             scores_to_aggregate["geo_threshold"]
         ):
             # loop through the pre-calculated ancestors of this result's taxon
@@ -218,48 +343,39 @@
                 if ancestor_taxon_id not in aggregated_scores:
                     aggregated_scores[ancestor_taxon_id] = {}
                     aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] = 0
-                    if not no_geo_scores:
-                        aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0
-                        aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 100
-                # aggregated vision score is a sum of descendant scores
+                    aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] = 0
+                    aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0
+                    aggregated_scores[ancestor_taxon_id][
+                        "aggregated_geo_threshold"
+                    ] = geo_threshold if (ancestor_taxon_id == taxon_id) else 1.0
+                # aggregated vision and combined scores are sums of descendant scores
                 aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score
-                if not no_geo_scores and geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]:
-                    # aggregated geo score is the max of descendant geo scores
+                aggregated_scores[ancestor_taxon_id]["aggregated_combined_score"] += combined_score
+
+                # aggregated geo score is the max of descendant geo scores
+                if geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]:
                     aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score
-                if not no_geo_scores and \
-                   aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] != 0 and \
-                   geo_score > geo_threshold:
-                    # aggregated geo threshold is set to 0 if any descendants are above their threshold
-                    aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 0
+
+                # aggregated geo threshold is the min of descendant geo thresholds
+                if ancestor_taxon_id != taxon_id and geo_threshold < aggregated_scores[
+                    ancestor_taxon_id
+                ]["aggregated_geo_threshold"]:
+                    aggregated_scores[ancestor_taxon_id][
+                        "aggregated_geo_threshold"
+                    ] = geo_threshold
 
         # turn the aggregated_scores dict into a data frame
         scores_df = pd.DataFrame.from_dict(aggregated_scores, orient="index")
 
         # merge the aggregated scores into the scoring taxonomy
-        all_node_scores = all_node_scores.join(scores_df)
-
-        # the aggregated scores of leaves will be NaN, so populate them with their original scores
-        all_node_scores["aggregated_vision_score"].fillna(
-            all_node_scores["normalized_vision_score"], inplace=True)
-        if no_geo_scores:
-            all_node_scores["aggregated_geo_score"] = 0
-            all_node_scores["aggregated_geo_threshold"] = 0
-        else:
-            all_node_scores["aggregated_geo_score"].fillna(
-                all_node_scores["normalized_geo_score"], inplace=True)
-            all_node_scores["aggregated_geo_threshold"].fillna(
-                all_node_scores["geo_threshold"], inplace=True)
-
-        if (no_geo_scores or score_without_geo):
-            # if there are no geo scores, or it was requested to not use geo scores to affect
-            # the final combined score, set the combined scores to be the same as the vision scores
-            all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"]
-        else:
-            # the combined score is simply the normalized vision score
-            # multipliedby the normalized geo score
-            all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] * \
-                all_node_scores["aggregated_geo_score"]
+        all_node_scores = all_node_scores.join(scores_df).query(
+            "aggregated_combined_score.notnull()"
+        )
 
-        # calculate a normalized combined score so all values add to 1, to be used for thresholding
+        # calculate normalized scores so all values add to 1, to be used for thresholding
+        sum_of_root_node_aggregated_vision_scores = all_node_scores.query(
+            "parent_taxon_id.isnull()")["aggregated_vision_score"].sum()
+        all_node_scores["normalized_aggregated_vision_score"] = all_node_scores[
+            "aggregated_vision_score"] / sum_of_root_node_aggregated_vision_scores
         sum_of_root_node_aggregated_combined_scores = all_node_scores.query(
             "parent_taxon_id.isnull()")["aggregated_combined_score"].sum()
         all_node_scores["normalized_aggregated_combined_score"] = all_node_scores[
@@ -267,26 +383,19 @@
 
         if debug:
             print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.))
-        thresholded_results = all_node_scores.query("normalized_aggregated_combined_score > 0.05")
-        print("\nTree of aggregated results:")
-        ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=(
-            lambda row: f'{row.name} [' +
-                        f'V:{round(row.aggregated_vision_score, 4)}, ' +
-                        f'G:{round(row.aggregated_geo_score, 4)}, ' +
-                        f'C:{round(row.aggregated_combined_score, 4)}, ' +
-                        f'NC:{round(row.normalized_aggregated_combined_score, 4)}]'))
-        print("")
+        # InatInferrer.print_aggregated_scores(all_node_scores)
         return all_node_scores
 
-    def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, raw_results=False):
+    def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[],
+                                    thresholded=False, raw_results=False):
         if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None):
             return
         try:
             taxon = self.taxonomy.df.loc[taxon_id]
         except Exception as e:
-            print(f'taxon `{taxon_id}` does not exist in the taxonomy')
+            print(f"taxon `{taxon_id}` does not exist in the taxonomy")
             raise e
-        if math.isnan(taxon["leaf_class_id"]):
+        if pd.isna(taxon["leaf_class_id"]):
             return
 
         geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features(
@@ -300,7 +409,7 @@
         # is smaller. This reduces data needed to be redendered client-side for the Data Layer
         # mapping approach, and maybe can be removed once switching to map tiles
         lower_bound_score = np.array([0.0001, taxon["geo_threshold"] / 10]).min()
-        geo_score_cells = geo_score_cells.query(f'geo_score > {lower_bound_score}')
+        geo_score_cells = geo_score_cells.query(f"geo_score > {lower_bound_score}")
 
         if bounds:
             min = geo_score_cells["geo_score"].min()
@@ -316,7 +425,7 @@
         return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"]))
 
     def h3_04_taxon_range(self, taxon_id, bounds=[]):
-        taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f'{taxon_id}.csv')
+        taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f"{taxon_id}.csv")
         if not os.path.exists(taxon_range_path):
             return None
         taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). \
@@ -328,7 +437,9 @@
         return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"]))
 
     def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]):
-        geomodel_results = self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {}
+        geomodel_results = self.h3_04_geo_results_for_taxon(
+            taxon_id, bounds, thresholded=True
+        ) or {}
         taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {}
         combined_results = {}
         for cell_key in geomodel_results:
@@ -353,6 +464,37 @@
             "nelng": geomodel_results["lng"].max()
         }
 
+    def common_ancestor_from_leaf_scores(
+        self, leaf_scores, debug=False, score_to_use="combined_score"
+    ):
+        aggregated_scores = self.aggregate_results(
+            leaf_scores,
+            debug=debug,
+            score_ratio_cutoff=InatInferrer.COMMON_ANCESTOR_CUTOFF_RATIO,
+            max_leaf_scores_to_consider=InatInferrer.COMMON_ANCESTOR_WINDOW,
+            column_for_cutoff=score_to_use
+        )
+        return self.common_ancestor_from_aggregated_scores(
+            aggregated_scores,
+            debug=debug,
+            score_to_use=score_to_use
+        )
+
+    def common_ancestor_from_aggregated_scores(
+        self, aggregated_scores, debug=False, score_to_use="combined_score"
+    ):
+        aggregated_score_to_use = "normalized_aggregated_vision_score" if \
+            score_to_use == "vision_score" else "normalized_aggregated_combined_score"
+        common_ancestor_candidates = aggregated_scores.query(
+            f"{aggregated_score_to_use} > 0.78 and rank_level >= 20 and rank_level <= 33"
+        ).sort_values(
+            by=["rank_level"]
+        )
+        if common_ancestor_candidates.empty:
+            return None
+
+        return common_ancestor_candidates.iloc[0]
+
     @staticmethod
     def prepare_image_for_inference(file_path):
         mime_type = magic.from_file(file_path, mime=True)
@@ -395,7 +537,25 @@
         # query for cells wtihin the buffered bounds, and potentially
         # on the other side of the antimeridian
-        query = f'lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and ' + \
-            f' ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})' + \
-            f' {antimedirian_condition})'
+        query = f"lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and " + \
+            f" ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})" + \
+            f" {antimedirian_condition})"
         return geo_df.query(query)
+
+    @staticmethod
+    def print_aggregated_scores(aggregated_scores):
+        thresholded_results = aggregated_scores.query(
+            "normalized_aggregated_combined_score > 0.005"
+        )
+        print("\nTree of aggregated results:")
+        ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=(
+            lambda row: f"{row.name} ["
+                        f"ID:{row.taxon_id}, "
+                        f"V:{round(row.aggregated_vision_score, 4)}, "
+                        f"NV:{round(row.normalized_aggregated_vision_score, 4)}, "
+                        f"G:{round(row.aggregated_geo_score, 4)}, "
+                        f"GT:{round(row.aggregated_geo_threshold, 4)}, "
+                        f"C:{round(row.aggregated_combined_score, 4)}, "
+                        f"NC:{round(row.normalized_aggregated_combined_score, 4)}]"
+        ))
+        print("")
diff --git a/lib/inat_vision_api.py b/lib/inat_vision_api.py
index 8703c43..35b0548 100644
--- a/lib/inat_vision_api.py
+++ b/lib/inat_vision_api.py
@@ -8,6 +8,7 @@
 from flask import Flask, request, render_template
 from web_forms import ImageForm
 from inat_inferrer import InatInferrer
+from inat_vision_api_responses import InatVisionAPIResponses
 
 
 class InatVisionAPI:
@@ -57,7 +58,7 @@ def h3_04_default_route(self, h3_04_method):
         else:
             results_dict = h3_04_method(taxon_id, bounds)
         if results_dict is None:
-            return f'Unknown taxon_id {taxon_id}', 422
+            return f"Unknown taxon_id {taxon_id}", 422
         return InatVisionAPI.round_floats(results_dict, 8)
 
     def h3_04_bounds_route(self):
@@ -67,7 +68,7 @@
 
         results_dict = self.inferrer.h3_04_bounds(taxon_id)
         if results_dict is None:
-            return f'Unknown taxon_id {taxon_id}', 422
+            return f"Unknown taxon_id {taxon_id}", 422
         return results_dict
 
     def index_route(self):
@@ -111,79 +112,25 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel):
         score_without_geo = (form.score_without_geo.data == "true")
         filter_taxon = self.inferrer.lookup_taxon(iconic_taxon_id)
         leaf_scores = self.inferrer.predictions_for_image(
-            file_path, lat, lng, filter_taxon, score_without_geo
+            file_path, lat, lng, filter_taxon, score_without_geo, debug=True
         )
         if form.aggregated.data == "true":
-            aggregated_results = self.inferrer.aggregate_results(leaf_scores, filter_taxon,
-                                                                 score_without_geo)
-            columns_to_return = [
-                "aggregated_combined_score",
-                "aggregated_geo_score",
-                "taxon_id",
-                "name",
-                "aggregated_vision_score",
-                "aggregated_geo_threshold"
-            ]
-            column_mapping = {
-                "taxon_id": "id",
-                "aggregated_combined_score": "combined_score",
-                "aggregated_geo_score": "geo_score",
-                "aggregated_vision_score": "vision_score",
-                "aggregated_geo_threshold": "geo_threshold"
-            }
+            aggregated_scores = self.inferrer.aggregate_results(leaf_scores, debug=True)
+            if form.format.data == "tree":
+                return InatVisionAPIResponses.aggregated_tree_response(aggregated_scores)
+            return InatVisionAPIResponses.aggregated_object_response(
+                leaf_scores, aggregated_scores, self.inferrer
+            )
 
-            no_geo_scores = (leaf_scores["geo_score"].max() == 0)
+        # legacy dict response
+        if geomodel != "true":
+            return InatVisionAPIResponses.legacy_dictionary_response(leaf_scores)
 
-            # set a cutoff where branches whose combined scores are below the threshold are ignored
-            # TODO: this threshold is completely arbitrary and needs testing
-            aggregated_results = aggregated_results.query("normalized_aggregated_combined_score > 0.05")
+        if form.format.data == "object":
+            return InatVisionAPIResponses.object_response(leaf_scores, self.inferrer)
 
-            # after setting a cutoff, get the parent IDs of the remaining taxa
-            parent_taxon_ids = aggregated_results["parent_taxon_id"].values  # noqa: F841
-            # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the
-            # taxa who are not parents of any remaining taxa
-            leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids")
-
-            leaf_results = leaf_results.sort_values("aggregated_combined_score", ascending=False).head(100)
-            score_columns = ["aggregated_combined_score", "aggregated_geo_score",
-                             "aggregated_vision_score", "aggregated_geo_threshold"]
-            leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index")
-            final_results = leaf_results[columns_to_return].rename(columns=column_mapping)
-        else:
-            no_geo_scores = (leaf_scores["geo_score"].max() == 0)
-            top_combined_score = leaf_scores.sort_values("combined_score", ascending=False).head(1)["combined_score"].values[0]
-            # set a cutoff so results whose combined scores are
-            # much lower than the best score are not returned
-            leaf_scores = leaf_scores.query(f'combined_score > {top_combined_score * 0.001}')
-
-            top100 = leaf_scores.sort_values("combined_score", ascending=False).head(100)
-            score_columns = ["combined_score", "geo_score", "normalized_vision_score", "geo_threshold"]
-            top100[score_columns] = top100[score_columns].multiply(100, axis="index")
-
-            # legacy dict response
-            if geomodel != "true":
-                top_taxon_combined_scores = top100[
-                    ["taxon_id", "combined_score"]
-                ].to_dict(orient="records")
-                return {x["taxon_id"]: x["combined_score"] for x in top_taxon_combined_scores}
-
-            # new array response
-            columns_to_return = [
-                "combined_score",
-                "geo_score",
-                "taxon_id",
-                "name",
-                "normalized_vision_score",
-                "geo_threshold"
-            ]
-            column_mapping = {
-                "taxon_id": "id",
-                "normalized_vision_score": "vision_score"
-            }
-            final_results = top100[columns_to_return].rename(columns=column_mapping)
-
-        return final_results.to_dict(orient="records")
+        return InatVisionAPIResponses.array_response(leaf_scores)
 
     def process_upload(self, form_image_data, image_uuid):
         if form_image_data is None:
@@ -208,7 +155,10 @@ def download_observation(self, observation_id, image_uuid):
         if not os.path.exists(cache_path):
             urllib.request.urlretrieve(
                 data["results"][0]["photos"][0]["url"].replace("square", "medium"), cache_path)
-        latlng = data["results"][0]["location"].split(",")
+        if data["results"][0]["location"] is None:
+            latlng = [None, None]
+        else:
+            latlng = data["results"][0]["location"].split(",")
         # return the path to the cached image, coordinates, and iconic taxon
         return cache_path, latlng[0], latlng[1], data["results"][0]["taxon"]["iconic_taxon_id"]
 
@@ -222,7 +172,7 @@
             taxon_id = int(taxon_id)
 
         if float(taxon_id) not in self.inferrer.taxonomy.leaf_df["taxon_id"].values:
-            return None, f'Unknown taxon_id {taxon_id}', 422
+            return None, f"Unknown taxon_id {taxon_id}", 422
         return taxon_id, None, None
 
     def valid_bounds_for_request(self, request):
@@ -242,12 +192,12 @@
     def write_logstash(image_uuid, file_path, request_start_datetime, request_start_time):
         request_end_time = time.time()
         request_time = round((request_end_time - request_start_time) * 1000, 6)
-        logstash_log = open('log/logstash.log', 'a')
-        log_data = {'@timestamp': request_start_datetime.isoformat(),
-                    'uuid': image_uuid,
-                    'duration': request_time,
-                    'client_ip': request.access_route[0],
-                    'image_size': os.path.getsize(file_path)}
+        logstash_log = open("log/logstash.log", "a")
+        log_data = {"@timestamp": request_start_datetime.isoformat(),
+                    "uuid": image_uuid,
+                    "duration": request_time,
+                    "client_ip": request.access_route[0],
+                    "image_size": os.path.getsize(file_path)}
         json.dump(log_data, logstash_log)
         logstash_log.write("\n")
         logstash_log.close()
diff --git a/lib/inat_vision_api_responses.py b/lib/inat_vision_api_responses.py
new file mode 100644
index 0000000..c71155f
--- /dev/null
+++ b/lib/inat_vision_api_responses.py
@@ -0,0 +1,217 @@
+import numpy as np
+import pandas as pd
+from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe
+
+
+class InatVisionAPIResponses:
+    @staticmethod
+    def legacy_dictionary_response(leaf_scores):
+        leaf_scores = InatVisionAPIResponses.limit_leaf_scores_for_response(leaf_scores)
+        leaf_scores = InatVisionAPIResponses.update_leaf_scores_scaling(leaf_scores)
+        top_taxon_combined_scores = leaf_scores[
+            ["taxon_id", "combined_score"]
+        ].to_dict(orient="records")
+        return {x["taxon_id"]: x["combined_score"] for x in top_taxon_combined_scores}
+
+    @staticmethod
+    def array_response(leaf_scores):
+        leaf_scores = InatVisionAPIResponses.limit_leaf_scores_for_response(leaf_scores)
+        leaf_scores = InatVisionAPIResponses.update_leaf_scores_scaling(leaf_scores)
+        return InatVisionAPIResponses.array_response_columns(leaf_scores).to_dict(orient="records")
+
+    @staticmethod
+    def object_response(leaf_scores, inferrer):
+        leaf_scores = InatVisionAPIResponses.limit_leaf_scores_for_response(leaf_scores)
+        leaf_scores = InatVisionAPIResponses.update_leaf_scores_scaling(leaf_scores)
+        results = InatVisionAPIResponses.array_response_columns(
+            leaf_scores
+        ).to_dict(orient="records")
+        common_ancestor = inferrer.common_ancestor_from_leaf_scores(leaf_scores, debug=True)
+        if common_ancestor is not None:
+            common_ancestor_frame = pd.DataFrame([common_ancestor])
+            common_ancestor_frame = InatVisionAPIResponses.update_aggregated_scores_scaling(
+                common_ancestor_frame
+            )
+            common_ancestor = InatVisionAPIResponses.array_response_common_ancestor_columns(
+                common_ancestor_frame
+            ).to_dict(orient="records")[0]
+
+        return {
+            "common_ancestor": common_ancestor,
+            "results": results
+        }
+
+    @staticmethod
+    def aggregated_tree_response(aggregated_scores):
+        top_leaf_combined_score = aggregated_scores.query(
+            "leaf_class_id.notnull()"
+        ).sort_values(
+            "normalized_aggregated_combined_score",
+            ascending=False
+        ).head(1)["normalized_aggregated_combined_score"].values[0]
+        # set a cutoff so results whose combined scores are
+        # much lower than the best score are not returned
+        aggregated_scores = aggregated_scores.query(
+            f"normalized_aggregated_combined_score > {top_leaf_combined_score * 0.001}"
+        )
+
+        printable_tree = ModelTaxonomyDataframe.printable_tree(
+            aggregated_scores,
+            display_taxon_lambda=(
+                lambda row: f"{row.name}\t\t["
+                            f"ID:{row.taxon_id}, "
+                            f"V:{round(row.aggregated_vision_score, 4)}, "
+                            f"NV:{round(row.normalized_aggregated_vision_score, 4)}, "
+                            f"G:{round(row.aggregated_geo_score, 4)}, "
+                            f"C:{round(row.aggregated_combined_score, 4)}, "
+                            f"NC:{round(row.normalized_aggregated_combined_score, 4)}]"
+            )
+        )
+        return "<pre>" + "\n".join(printable_tree) + "</pre>"
+
+    @staticmethod
+    def aggregated_object_response(leaf_scores, aggregated_scores, inferrer):
+        top_leaf_combined_score = aggregated_scores.query(
+            "leaf_class_id.notnull()"
+        ).sort_values(
+            "normalized_aggregated_combined_score",
+            ascending=False
+        ).head(1)["normalized_aggregated_combined_score"].values[0]
+
+        top_100_leaves = aggregated_scores.query(
+            "leaf_class_id.notnull() and "
+            f"normalized_aggregated_combined_score > {top_leaf_combined_score * 0.001}"
+        ).sort_values(
+            "normalized_aggregated_combined_score",
+            ascending=False
+        ).head(100)
+
+        common_ancestor = inferrer.common_ancestor_from_leaf_scores(leaf_scores, debug=True)
+        aggregated_scores = InatVisionAPIResponses.update_aggregated_scores_scaling(
+            aggregated_scores
+        )
+
+        filter_taxa = np.array([])
+        for leaf_taxon_id in top_100_leaves["taxon_id"].to_numpy(dtype=int):
+            filter_taxa = np.append(filter_taxa, leaf_taxon_id)
+            filter_taxa = np.append(filter_taxa,
+                                    inferrer.taxonomy.taxon_ancestors[leaf_taxon_id])
+        top_100_and_ancestors = aggregated_scores[aggregated_scores["taxon_id"].isin(filter_taxa)]
+
+        final_results = InatVisionAPIResponses.aggregated_scores_response_columns(
+            top_100_and_ancestors
+        )
+
+        if common_ancestor is not None:
+            common_ancestor_frame = pd.DataFrame([common_ancestor])
+            common_ancestor_frame = InatVisionAPIResponses.update_aggregated_scores_scaling(
+                common_ancestor_frame
+            )
+            common_ancestor = InatVisionAPIResponses.aggregated_scores_response_columns(
+                common_ancestor_frame
+            ).to_dict(orient="records")[0]
+
+        return {
+            "common_ancestor": common_ancestor,
+            "results": final_results.to_dict(orient="records")
+        }
+
+    @staticmethod
+    def limit_leaf_scores_for_response(leaf_scores):
+        top_combined_score = leaf_scores.sort_values(
+            "combined_score",
+            ascending=False
+        ).head(1)["combined_score"].values[0]
+        # set a cutoff so results whose combined scores are
+        # much lower than the best score are not returned
+        leaf_scores = leaf_scores.query(f"combined_score > {top_combined_score * 0.001}")
+        return leaf_scores.sort_values("combined_score", ascending=False).head(100)
+
+    @staticmethod
+    def update_leaf_scores_scaling(leaf_scores):
+        score_columns = [
+            "combined_score",
+            "geo_score",
+            "normalized_vision_score",
+            "geo_threshold"
+        ]
+        leaf_scores[score_columns] = leaf_scores[
+            score_columns
+        ].multiply(100, axis="index")
+        return leaf_scores
+
+    @staticmethod
+    def update_aggregated_scores_scaling(aggregated_scores):
+        score_columns = [
+            "aggregated_combined_score",
+            "normalized_aggregated_combined_score",
+            "aggregated_geo_score",
+            "aggregated_vision_score",
+            "aggregated_geo_threshold"
+        ]
+        aggregated_scores[score_columns] = aggregated_scores[
+            score_columns
+        ].multiply(100, axis="index")
+        return aggregated_scores
+
+    @staticmethod
+    def array_response_columns(leaf_scores):
+        columns_to_return = [
+            "combined_score",
+            "geo_score",
+            "taxon_id",
+            "name",
+            "normalized_vision_score",
+            "geo_threshold"
+        ]
+        column_mapping = {
+            "taxon_id": "id",
+            "normalized_vision_score": "vision_score"
+        }
+        return leaf_scores[columns_to_return].rename(columns=column_mapping)
+
+    @staticmethod
+    def array_response_common_ancestor_columns(common_ancestor_dataframe):
+        columns_to_return = [
+            "aggregated_combined_score",
+            "aggregated_geo_score",
+            "taxon_id",
+            "name",
+            "normalized_aggregated_vision_score",
+            "aggregated_geo_threshold"
+        ]
+        column_mapping = {
+            "aggregated_combined_score": "combined_score",
+            "aggregated_geo_score": "geo_score",
+            "taxon_id": "id",
"normalized_aggregated_vision_score": "vision_score", + "aggregated_geo_threshold": "geo_threshold" + } + return common_ancestor_dataframe[columns_to_return].rename(columns=column_mapping) + + @staticmethod + def aggregated_scores_response_columns(aggregated_scores): + columns_to_return = [ + "aggregated_combined_score", + "normalized_aggregated_combined_score", + "aggregated_geo_score", + "taxon_id", + "parent_taxon_id", + "name", + "rank_level", + "left", + "right", + "depth", + "aggregated_vision_score", + "aggregated_geo_threshold", + ] + column_mapping = { + "taxon_id": "id", + "parent_taxon_id": "parent_id", + "aggregated_combined_score": "combined_score", + "normalized_aggregated_combined_score": "normalized_combined_score", + "aggregated_geo_score": "geo_score", + "aggregated_vision_score": "vision_score", + "aggregated_geo_threshold": "geo_threshold" + } + return aggregated_scores[columns_to_return].rename(columns=column_mapping) diff --git a/lib/model_taxonomy.py b/lib/model_taxonomy.py deleted file mode 100644 index d390516..0000000 --- a/lib/model_taxonomy.py +++ /dev/null @@ -1,73 +0,0 @@ -import csv -from lib.taxon import Taxon - - -class ModelTaxonomy: - - def __init__(self, path): - self.load_mapping(path) - self.assign_nested_values() - - def load_mapping(self, path): - self.node_key_to_leaf_class_id = {} - self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy have a parent ID of 0, - # so create a fake taxon of Life to represent the root of the entire tree - self.taxa = {0: Taxon({"name": "Life", "depth": 0})} - self.taxon_children = {} - try: - with open(path) as csv_file: - csv_reader = csv.DictReader(csv_file, delimiter=",") - for row in csv_reader: - taxon_id = int(row["taxon_id"]) - rank_level = float(row["rank_level"]) - leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None - parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0 - # some taxa are not leaves and aren't represented in the leaf layer - if leaf_class_id is not None: - self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id - self.leaf_class_to_taxon[leaf_class_id] = taxon_id - self.taxa[taxon_id] = Taxon({ - "id": taxon_id, - "name": row["name"], - "parent_id": parent_id, - "leaf_class_id": leaf_class_id, - "rank_level": rank_level - }) - if parent_id not in self.taxon_children: - self.taxon_children[parent_id] = [] - self.taxon_children[parent_id].append(taxon_id) - except IOError as e: - print(e) - print(f"\n\nCannot open mapping file `{path}`\n\n") - raise e - - # prints to the console a representation of this tree - def print(self, taxon_id=0, ancestor_prefix=""): - children = self.taxon_children[taxon_id] - index = 0 - for child_id in children: - last_in_branch = (index == len(children) - 1) - index += 1 - icon = "└──" if last_in_branch else "├──" - prefixIcon = " " if last_in_branch else "│ " - taxon = self.taxa[child_id] - print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}') - if child_id in self.taxon_children: - self.print(child_id, f"{ancestor_prefix}{prefixIcon}") - - # calculated nested set left and right values and depth representing how many nodes - # down the taxon is from Life. 
These can be later used for an efficient way to calculate - # if a taxon is a descendant of another - def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]): - for child_id in self.taxon_children[taxon_id]: - self.taxa[child_id].set("left", index) - self.taxa[child_id].set("depth", depth) - self.taxa[child_id].set("ancestors", ancestors) - index += 1 - if child_id in self.taxon_children: - child_ancestors = ancestors + [child_id] - index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors) - self.taxa[child_id].set("right", index) - index += 1 - return index diff --git a/lib/model_taxonomy_dataframe.py b/lib/model_taxonomy_dataframe.py index a707c7d..54d0ebc 100644 --- a/lib/model_taxonomy_dataframe.py +++ b/lib/model_taxonomy_dataframe.py @@ -1,4 +1,3 @@ -import math import pandas as pd @@ -8,7 +7,27 @@ def __init__(self, path, thresholds_path): self.load_mapping(path, thresholds_path) def load_mapping(self, path, thresholds_path): - self.df = pd.read_csv(path) + self.df = pd.read_csv( + path, + usecols=[ + "parent_taxon_id", + "taxon_id", + "rank_level", + "leaf_class_id", + "iconic_class_id", + "spatial_class_id", + "name" + ], + dtype={ + "parent_taxon_id": "Int64", + "taxon_id": int, + "rank_level": float, + "leaf_class_id": "Int64", + "iconic_class_id": "Int64", + "spatial_class_id": "Int64", + "name": pd.StringDtype() + } + ) # left and right will be used to store nested set indices self.df["left"] = pd.Series([], dtype=object) self.df["right"] = pd.Series([], dtype=object) @@ -17,7 +36,7 @@ def load_mapping(self, path, thresholds_path): self.taxon_ancestors = {} for index, taxon in self.df.iterrows(): self.taxon_row_mapping[taxon["taxon_id"]] = index - parent_id = 0 if math.isnan(taxon["parent_taxon_id"]) else int(taxon["parent_taxon_id"]) + parent_id = 0 if pd.isna(taxon["parent_taxon_id"]) else int(taxon["parent_taxon_id"]) if parent_id not in self.taxon_children: self.taxon_children[parent_id] = [] self.taxon_children[parent_id].append(taxon["taxon_id"]) @@ -34,14 +53,17 @@ def load_mapping(self, path, thresholds_path): # calculate nested set left and right values. 
@@ -34,14 +53,17 @@ def load_mapping(self, path, thresholds_path):
 
     # calculate nested set left and right values. These can be later used for an efficient
     # way to calculate if a taxon is an ancestor or descendant of another
-    def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]):
+    def assign_nested_values(self, taxon_id=0, index=0, depth=0, ancestor_taxon_ids=[]):
         for child_id in self.taxon_children[taxon_id]:
             self.df.at[self.taxon_row_mapping[child_id], "left"] = index
-            self.taxon_ancestors[child_id] = ancestor_taxon_ids
+            self.df.at[self.taxon_row_mapping[child_id], "depth"] = depth
+            child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id]
+            self.taxon_ancestors[child_id] = child_ancestor_taxon_ids
             index += 1
             if child_id in self.taxon_children:
-                child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id]
-                index = self.assign_nested_values(child_id, index, child_ancestor_taxon_ids)
+                index = self.assign_nested_values(
+                    child_id, index, depth + 1, child_ancestor_taxon_ids
+                )
             self.df.at[self.taxon_row_mapping[child_id], "right"] = index
             index += 1
         return index
@@ -50,25 +72,39 @@ def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]):
     def children(df, taxon_id):
         if taxon_id == 0:
             return df.query("parent_taxon_id.isnull()")
-        return df.query(f'parent_taxon_id == {taxon_id}')
+        return df.query(f"parent_taxon_id == {taxon_id}")
 
     @staticmethod
     def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None):
+        print("\n".join(ModelTaxonomyDataframe.printable_tree(
+            df, taxon_id, ancestor_prefix, display_taxon_lambda
+        )))
+
+    @staticmethod
+    def printable_tree(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None):
         children = ModelTaxonomyDataframe.children(df, taxon_id)
         index = 0
         if "aggregated_combined_score" in children:
             children = children.sort_values("aggregated_combined_score", ascending=False)
         else:
             children = children.sort_values("name")
+        linesToPrint = []
         for row in children.itertuples():
             last_in_branch = (index == len(children) - 1)
             index += 1
             icon = "└──" if last_in_branch else "├──"
             prefixIcon = "   " if last_in_branch else "│   "
-            print(f'{ancestor_prefix}{icon}', end="")
+            lineToPrint = f"{ancestor_prefix}{icon}"
             if display_taxon_lambda is None:
-                print(f'{row.name} :: {row.left}:{row.right}')
+                lineToPrint += f"{row.name} :: {row.left}:{row.right}"
             else:
-                print(display_taxon_lambda(row))
+                lineToPrint += display_taxon_lambda(row)
+            linesToPrint.append(lineToPrint)
             if row.right != row.left + 1:
-                ModelTaxonomyDataframe.print(df, row.taxon_id, f"{ancestor_prefix}{prefixIcon}", display_taxon_lambda)
+                linesToPrint += ModelTaxonomyDataframe.printable_tree(
+                    df,
+                    row.taxon_id,
+                    f"{ancestor_prefix}{prefixIcon}",
+                    display_taxon_lambda
+                )
+        return linesToPrint
diff --git a/lib/model_test_data_export_manager.py b/lib/model_test_data_export_manager.py
index 4c3ee2e..3aa35f9 100644
--- a/lib/model_test_data_export_manager.py
+++ b/lib/model_test_data_export_manager.py
@@ -21,9 +21,9 @@ def load_train_data_photo_ids(self):
     def export_path(self, filename_addition):
         currentDatetime = datetime.now()
         timestamp = currentDatetime.strftime("%Y%m%d")
-        export_path = f'test-obs-{timestamp}'
+        export_path = f"test-obs-{timestamp}"
         if filename_addition:
-            export_path += f'-{filename_addition}'
+            export_path += f"-{filename_addition}"
         if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]:
             export_path += "-" + self.cmd_args["filename_suffix"]
         export_path += ".csv"
@@ -38,7 +38,7 @@ async def generate_from_cmd_args(self):
 
         parameters_string = None
         if api_parameters:
-            parameters_string = "-".join(map(lambda key: f'{key}-{api_parameters[key]}',
+            parameters_string = "-".join(map(lambda key: f"{key}-{api_parameters[key]}",
                                              api_parameters))
         export_path = self.export_path(parameters_string)
         exporter = ModelTestDataExporter(
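For reference, the joined parameter string above yields dated filenames of the following shape; the parameter values here are invented purely for illustration:

```python
from datetime import datetime

# hypothetical CLI parameters, just to show the filename shape
api_parameters = {"taxon_id": 3, "place_id": 97394}

parameters_string = "-".join(map(lambda key: f"{key}-{api_parameters[key]}",
                                 api_parameters))
timestamp = datetime.now().strftime("%Y%m%d")
export_path = f"test-obs-{timestamp}-{parameters_string}.csv"
print(export_path)  # e.g. test-obs-20240101-taxon_id-3-place_id-97394.csv
```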
diff --git a/lib/model_test_data_exporter.py b/lib/model_test_data_exporter.py
index 5013bd7..8fd2639 100644
--- a/lib/model_test_data_exporter.py
+++ b/lib/model_test_data_exporter.py
@@ -89,15 +89,15 @@ async def generate_test_data(self):
             await self.fetch_more_data()
 
     async def fetch_more_data(self):
-        self.queue = asyncio.Queue(ModelTestDataExporter.N_WORKERS)
+        self.queue = asyncio.Queue()
         self.workers = [asyncio.create_task(self.worker_task())
                         for _ in range(ModelTestDataExporter.N_WORKERS)]
         min_pages_remaining = math.ceil(
             (self.max_results / ModelTestDataExporter.API_REQUEST_PER_PAGE)
         )
-        print(f'Queueing {min_pages_remaining} workers')
+        print(f"Queueing {min_pages_remaining} pages")
         for i in range(min_pages_remaining):
-            await self.queue.put(i)
+            self.queue.put_nowait(i)
         await self.queue.join()
         for worker in self.workers:
             worker.cancel()
@@ -110,7 +110,7 @@ async def process_api_response(self):
         if self.finished():
             return
 
-        print(f'Fetching more results... {self.rows_written} so far')
+        print(f"Fetching more results... {self.rows_written} so far")
         starting_rows_written = self.rows_written
         async with self.session.get(ModelTestDataExporter.API_BASE_URL,
                                     params=self.api_parameters) as response:
@@ -145,6 +145,7 @@ def process_api_response_row(self, row):
                 metric_counts[metric["metric"]] -= 1
         if ("location" in metric_counts and metric_counts["location"] < 0) \
                 or ("evidence" in metric_counts and metric_counts["evidence"] < 0) \
+                or ("subject" in metric_counts and metric_counts["subject"] < 0) \
                 or ("date" in metric_counts and metric_counts["date"] < 0) \
                 or ("recent" in metric_counts and metric_counts["recent"] < 0):
             self.used_observations[row["uuid"]] = True
@@ -158,7 +159,8 @@ def process_api_response_row(self, row):
             self.used_observations[row["uuid"]] = True
             return
 
-        if row["quality_grade"] == "casual" and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]):
+        if row["quality_grade"] == "casual" \
+                and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]):
             self.used_observations[row["uuid"]] = True
             return
 
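Dropping the queue bound and switching to put_nowait means every page index can be enqueued synchronously before the workers drain the queue; the old `await self.queue.put(i)` could only ever suspend because the queue was capped at N_WORKERS. A self-contained sketch of the same pattern, with simplified names and a sleep standing in for the API call:

```python
import asyncio

N_WORKERS = 3

async def main():
    queue = asyncio.Queue()  # unbounded, so put_nowait below cannot raise QueueFull

    async def worker():
        while True:
            page = await queue.get()
            await asyncio.sleep(0.01)  # stand-in for fetching one API page
            queue.task_done()

    workers = [asyncio.create_task(worker()) for _ in range(N_WORKERS)]
    for page in range(10):
        queue.put_nowait(page)  # enqueue every page up front, no await needed
    await queue.join()          # resumes once each item has been task_done()'d
    for w in workers:
        w.cancel()
    await asyncio.gather(*workers, return_exceptions=True)

asyncio.run(main())
```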
diff --git a/lib/pt_geo_prior_model.py b/lib/pt_geo_prior_model.py
index b81b4e9..77ea3b8 100644
--- a/lib/pt_geo_prior_model.py
+++ b/lib/pt_geo_prior_model.py
@@ -31,7 +31,7 @@ def predict(self, latitude, longitude, filter_taxon_id=None):
             try:
                 filter_taxon = self.taxonomy.df.iloc[filter_taxon_id]
             except Exception as e:
-                print(f'filter_taxon `{filter_taxon_id}` does not exist in the taxonomy')
+                print(f"filter_taxon `{filter_taxon_id}` does not exist in the taxonomy")
                 raise e
 
         location = np.array([longitude, latitude])[np.newaxis, ...]
         # we're not currently using date inference, so set default values for date
diff --git a/lib/taxon.py b/lib/taxon.py
deleted file mode 100644
index 63e169a..0000000
--- a/lib/taxon.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Taxon:
-#   parent_taxon_id
-#   taxon_id
-#   rank_level
-#   leaf_class_id
-#   iconic_class_id
-#   name
-#   left
-#   right
-#   depth
-
-
-class Taxon:
-
-    def __init__(self, row):
-        for key in row:
-            self.set(key, row[key])
-
-    def set(self, attr, val):
-        setattr(self, attr, val)
-
-    def is_or_descendant_of(self, taxon):
-        if self.id == taxon.id:
-            return True
-        return self.descendant_of(taxon)
-
-    # using the nested set left and right values, a taxon is a descendant of another
-    # as long as its left is higher and its right is lower
-    def descendant_of(self, taxon):
-        return self.left > taxon.left and self.right < taxon.right
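With lib/taxon.py removed, nothing wraps rows in Taxon objects anymore, but the descendant test it documented (a descendant's left is higher and its right is lower) applies directly to the left/right columns that ModelTaxonomyDataframe computes. A sketch of the equivalent dataframe-side check; the helper name is hypothetical and not part of this patch:

```python
# Hypothetical helper, not in the diff: the same nested-set test that
# Taxon.descendant_of performed, expressed against the taxonomy dataframe.
def is_descendant_of(df, taxon_id, ancestor_taxon_id):
    taxon = df.query(f"taxon_id == {taxon_id}").iloc[0]
    ancestor = df.query(f"taxon_id == {ancestor_taxon_id}").iloc[0]
    return taxon["left"] > ancestor["left"] and taxon["right"] < ancestor["right"]
```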
".join(printable_tree) + "