diff --git a/export_data.py b/export_data.py new file mode 100644 index 0000000..96559b1 --- /dev/null +++ b/export_data.py @@ -0,0 +1,30 @@ +import click +import json + + +@click.command() +@click.option("--exclude-train-photos-path", "-x", multiple=True, + help="Exclude photos that were included in these training sets.") +@click.option("--limit", type=int, show_default=True, default=5000, + help="Number of observations to include.") +@click.option("--standard_set", help="Export the standard set of 18 filtered datasets.") +@click.option("--filename_suffix", type=str, help="String to add to end of filename.") +@click.option("--place_id", type=int, help="Export observations in this place.") +@click.option("--taxon_id", type=int, help="Export observations in this taxon.") +def test(**args): + # some libraries are slow to import, so wait until command is validated and properly invoked + from lib.model_test_data_exporter import ModelTestDataExporter + print("\nArguments:") + print(json.dumps(args, indent=4)) + print("\nInitializing ModelTestDataExporter...\n") + model_test_data_exporter = ModelTestDataExporter(**args) + print("Exporting data...\n") + if "standard_set" in args and args["standard_set"]: + model_test_data_exporter.generate_standard_set() + else: + model_test_data_exporter.generate_from_cmd_args() + print("\nDone\n") + + +if __name__ == "__main__": + test() diff --git a/generate_thresholds.py b/generate_thresholds.py index 2d3bad4..3f0bd14 100644 --- a/generate_thresholds.py +++ b/generate_thresholds.py @@ -3,229 +3,347 @@ """ import argparse -from tqdm.auto import tqdm +import tifffile +import os import pandas as pd -import h3pandas import numpy as np +import h3 +import h3pandas +import tensorflow as tf +import csv import math +import json +from tqdm.auto import tqdm +import tensorflow as tf from sklearn.metrics import precision_recall_curve -from sklearn.metrics import auc -import sys +import matplotlib.pyplot as plt +import warnings -def ratios_for_taxon_id(taxon_id, all_spatial_grid_counts, train_df_h3): - - #counts in each presence cell for target taxon - target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts() +class ResLayer(tf.keras.layers.Layer): + def __init__(self): + super(ResLayer, self).__init__() + self.w1 = tf.keras.layers.Dense( + 256, activation="relu", kernel_initializer="he_normal" + ) + self.w2 = tf.keras.layers.Dense( + 256, activation="relu", kernel_initializer="he_normal" + ) + self.dropout = tf.keras.layers.Dropout(rate=0.5) + self.add = tf.keras.layers.Add() + + def call(self, inputs): + x = self.w1(inputs) + x = self.dropout(x) + x = self.w2(x) + x = self.add([x, inputs]) + return x + + def get_config(self): + return {} + +class Taxon: + + def __init__(self, row): + for key in row: + setattr(self, key, row[key]) + + def set(self, attr, val): + setattr(self, attr, val) + + def is_or_descendant_of(self, taxon): + if self.id == taxon.id: + return True + return self.descendant_of(taxon) + + # using the nested set left and right values, a taxon is a descendant of another + # as long as its left is higher and its right is lower + def descendant_of(self, taxon): + return self.left > taxon.left and self.right < taxon.right + +class ModelTaxonomy: + + def __init__(self, path): + self.load_mapping(path) + self.assign_nested_values() + + def load_mapping(self, path): + self.node_key_to_leaf_class_id = {} + self.leaf_class_to_taxon = {} + # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0, + # so 
create a fake taxon of Life to represent the root of the entire tree + self.taxa = {0: Taxon({"name": "Life", "depth": 0})} + self.taxon_children = {} + try: + with open(path) as csv_file: + csv_reader = csv.DictReader(csv_file, delimiter=",") + for row in csv_reader: + taxon_id = int(row["taxon_id"]) + rank_level = float(row["rank_level"]) + leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None + parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0 + # some taxa are not leaves and aren't represented in the leaf layer + if leaf_class_id is not None: + self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id + self.leaf_class_to_taxon[leaf_class_id] = taxon_id + self.taxa[taxon_id] = Taxon({ + "id": taxon_id, + "name": row["name"], + "parent_id": parent_id, + "leaf_class_id": leaf_class_id, + "rank_level": rank_level + }) + if parent_id not in self.taxon_children: + self.taxon_children[parent_id] = [] + self.taxon_children[parent_id].append(taxon_id) + except IOError as e: + print(e) + print(f"\n\nCannot open mapping file `{path}`\n\n") + raise e + + # prints to the console a representation of this tree + def print(self, taxon_id=0, ancestor_prefix=""): + children = self.taxon_children[taxon_id] + index = 0 + for child_id in children: + last_in_branch = (index == len(children) - 1) + index += 1 + icon = "└──" if last_in_branch else "├──" + prefixIcon = " " if last_in_branch else "│ " + taxon = self.taxa[child_id] + print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}') + if child_id in self.taxon_children: + self.print(child_id, f"{ancestor_prefix}{prefixIcon}") + + # calculated nested set left and right values and depth representing how many nodes + # down the taxon is from Life. These can be later used for an efficient way to calculate + # if a taxon is a descendant of another + def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]): + for child_id in self.taxon_children[taxon_id]: + self.taxa[child_id].set("left", index) + self.taxa[child_id].set("depth", depth) + self.taxa[child_id].set("ancestors", ancestors) + index += 1 + if child_id in self.taxon_children: + child_ancestors = ancestors + [child_id] + index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors) + self.taxa[child_id].set("right", index) + index += 1 + return index + + +class TFGeoPriorModelEnv: + + def __init__(self, model, taxonomy): + self.taxonomy = taxonomy + # initialize the geo model for inference + self.gpmodel = tf.keras.models.load_model( + model, + custom_objects={'ResLayer': ResLayer}, + compile=False + ) - #background count per target taxon count in target taxon presence cells - counts = pd.DataFrame({ - "target": target_spatial_grid_counts, - "all": all_spatial_grid_counts, - }) - counts = counts.fillna(0) - counts["target_ratio"] = counts["all"]/counts["target"] - counts.replace([np.inf, -np.inf], np.nan, inplace=True) - good_counts = counts["target_ratio"].dropna() - return good_counts - -def pseudo_absences_for_taxon_id( - taxon_id, - all_spatial_grid_counts, - train_df_h3, - test_df_h3, - full_spatial_data_lookup_table - ): - - #count cutoff is mean background count per target taxon count in target taxon presence cells - count_cutoff = ratios_for_taxon_id(taxon_id, all_spatial_grid_counts, train_df_h3).mean() - cutoff_grid_cell_indices = set(all_spatial_grid_counts[all_spatial_grid_counts>count_cutoff].index) - #absence candidates from test - ilocs = np.unique(np.random.randint(0, len(test_df_h3), 
10_000)) - sample_occupancy_pseudoabsences = [] - for i in ilocs: - row = test_df_h3.iloc[i] - - if row.name in cutoff_grid_cell_indices: - if taxon_id in full_spatial_data_lookup_table.loc[row.name].taxon_id: - #candidate in cell containing target taxon so can't be absence - pass - else: - #candidate in cell not containing target taxon and - #count is greater than count cutoff. Use as absence - sample_occupancy_pseudoabsences.append( - (row.lat, row.lng) - ) + def features_for_one_class_elevation(self, latitude, longitude, elevation): + """Evalutes the model for a single class and multiple locations + + Args: + latitude (list): A list of latitudes + longitude (list): A list of longitudes (same length as latitude) + elevation (list): A list of elevations (same length as latitude) + class_of_interest (int): The single class to eval + + Returns: + numpy array: scores for class of interest at each location + """ + def encode_loc(latitude, longitude, elevation): + latitude = np.array(latitude) + longitude = np.array(longitude) + elevation = np.array(elevation) + elevation = elevation.astype("float32") + grid_lon = longitude.astype('float32') / 180.0 + grid_lat = latitude.astype('float32') / 90.0 + + elevation[elevation>0] = elevation[elevation>0]/6574.0 + elevation[elevation<0] = elevation[elevation<0]/32768.0 + norm_elev = elevation + + if np.isscalar(grid_lon): + grid_lon = np.array([grid_lon]) + if np.isscalar(grid_lat): + grid_lat = np.array([grid_lat]) + if np.isscalar(norm_elev): + norm_elev = np.array([norm_elev]) + + norm_loc = tf.stack([grid_lon, grid_lat], axis=1) + + encoded_loc = tf.concat([ + tf.sin(norm_loc * math.pi), + tf.cos(norm_loc * math.pi), + tf.expand_dims(norm_elev, axis=1), + + ], axis=1) + + return encoded_loc + + encoded_loc = encode_loc(latitude, longitude, elevation) + loc_emb = self.gpmodel.layers[0](encoded_loc) + + # res layers - feature extraction + x = self.gpmodel.layers[1](loc_emb) + x = self.gpmodel.layers[2](x) + x = self.gpmodel.layers[3](x) + x = self.gpmodel.layers[4](x) - #limit number of absences - if len(sample_occupancy_pseudoabsences) >= 500: - break + # process just the one class + return x + + def eval_one_class_elevation_from_features(self, x, class_of_interest): + return tf.keras.activations.sigmoid( + tf.matmul( + x, + tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0), + transpose_b=True + ) + ).numpy() + +def ignore_shapely_deprecation_warning(message, category, filename, lineno, file=None, line=None): + if "array interface is deprecated" in str(message): + return None + return warnings.defaultaction(message, category, filename, lineno, file, line) + +def main(args): + print("loading in the model...") + mt = ModelTaxonomy(args.taxonomy) + tfgpm = TFGeoPriorModelEnv(args.model, mt) - sample_occupancy_pseudoabsences = pd.DataFrame( - sample_occupancy_pseudoabsences, - columns=["lat", "lng"] + print("setting up the map...") + warnings.showwarning = ignore_shapely_deprecation_warning + im = tifffile.imread(args.elevation) + im_df = pd.DataFrame(im) + im_df.index = np.linspace(90, -90, 2160) + im_df.columns = np.linspace(-180, 180, 4320) + im_df = im_df.reset_index() + im_df = im_df.melt( + id_vars=["index"], ) - sample_occupancy_pseudoabsences["taxon_id"] = taxon_id - return sample_occupancy_pseudoabsences - -def get_threshold(test, predictions): - - precision, recall, thresholds = precision_recall_curve(test, predictions) - p1 = (2 * precision * recall) - p2 = (precision + recall) - out = np.zeros( (len(p1)) ) - 
fscore = np.divide(p1,p2, out=out, where=p2!=0) - index = np.argmax(fscore) - thresholdOpt = round(thresholds[index], ndigits = 4) - prauc = auc(recall, precision) - return thresholdOpt, prauc - -def get_predictions(latitude,longitude,taxon_id, mt, tfgpm): - - class_of_interest = mt.node_key_to_leaf_class_id[taxon_id] - predictions = tfgpm.eval_one_class( - np.array(latitude), - np.array(longitude), - class_of_interest + im_df.columns = ["lat", "lng", "elevation"] + elev_dfh3 = im_df.h3.geo_to_h3(args.h3_resolution) + elev_dfh3 = elev_dfh3.drop( + columns=['lng', 'lat'] + ).groupby("h3_0"+str(args.h3_resolution)).mean() + gdfk = elev_dfh3.h3.h3_to_geo() + gdfk["lng"] = gdfk["geometry"].x + gdfk["lat"] = gdfk["geometry"].y + _ = gdfk.pop("geometry") + gdfk = gdfk.rename_axis('h3index') + + print("making features...") + feats = tfgpm.features_for_one_class_elevation( + latitude=list(gdfk.lat), + longitude=list(gdfk.lng), + elevation=list(gdfk.elevation) ) - return predictions -def get_predictions_py(latitude,longitude,taxon_id, net_params, model, torch): - - class_of_interest = net_params["params"]["class_to_taxa"].index(taxon_id) - grid_lon = torch.from_numpy(longitude/180) - grid_lat = torch.from_numpy(latitude/90) - grid_lon = grid_lon.repeat(1,1).unsqueeze(2) - grid_lat = grid_lat.repeat(1, 1).unsqueeze(2) - loc_ip = torch.cat((grid_lon, grid_lat), 2) - feats = torch.cat((torch.sin(math.pi*loc_ip), torch.cos(math.pi*loc_ip)), 2) - model.eval() - with torch.no_grad(): - pred = model(feats, class_of_interest=class_of_interest) - predictions = pred.cpu().numpy()[0] - return predictions - -def main(args): - - print("loading in test and train data...") - test_df = pd.read_csv(args.test_spatial_data_path, - usecols=["taxon_id","latitude","longitude"]).rename({ - "latitude": "lat", - "longitude": "lng" - }, axis=1) - train_df = pd.read_csv(args.train_spatial_data_path, - usecols=["taxon_id","latitude","longitude"]).rename({ + print("loading in the training data...") + train_df = pd.read_csv(args.train_spatial_data, + usecols=["taxon_id","latitude","longitude","captive"]).rename({ "latitude": "lat", "longitude": "lng" }, axis=1) - taxon_ids = test_df.taxon_id.unique() - if args.stop_after is not None: - taxon_ids = taxon_ids[0:args.stop_after] - - print("calculating absences...") - test_df_h3 = test_df.h3.geo_to_h3(args.h3_resolution) + train_df = train_df[train_df.captive==0] #no-CID ok, wild only + train_df.drop(["captive"],axis=1) train_df_h3 = train_df.h3.geo_to_h3(args.h3_resolution) - full_spatial_data_lookup_table = pd.concat([train_df_h3, test_df_h3]).pivot_table( - index="h3_0"+str(args.h3_resolution), - values="taxon_id", - aggfunc=set - ) - pseudoabsence_df = pd.DataFrame() all_spatial_grid_counts = train_df_h3.index.value_counts() - for taxon_id in tqdm(taxon_ids): - pa = pseudo_absences_for_taxon_id( - taxon_id, - all_spatial_grid_counts, - train_df_h3, - test_df_h3, - full_spatial_data_lookup_table - ) - pseudoabsence_df = pd.concat([pseudoabsence_df, pa]) - - print("calculating thresholds...") - if args.model_type == "tf": - from lib.taxon import Taxon - from lib.model_taxonomy import ModelTaxonomy - from lib.tf_gp_model import TFGeoPriorModel - mt = ModelTaxonomy(args.taxonomy_path) - tfgpm = TFGeoPriorModel(args.model_path, mt) - elif args.model_type == "pytorch": - import torch - sys.path.append("../geo_prior_inat") - from geo_prior import models - net_params = torch.load(args.model_path, map_location="cpu") - net_params["params"]['device'] = torch.device('cuda' if 
torch.cuda.is_available() else 'cpu') - model_name = models.select_model(net_params["params"]["model"]) - model = model_name(num_inputs=net_params["params"]['num_feats'], - num_classes=net_params["params"]['num_classes'], - num_filts=net_params["params"]['num_filts'], - num_users=net_params["params"]['num_users'], - num_context=net_params["params"]['num_context']).to(net_params["params"]['device']) - model.load_state_dict(net_params['state_dict']) + presence_absence = pd.DataFrame({ + "background": all_spatial_grid_counts, + }) + presence_absence = presence_absence.fillna(0) + + print("...looping through taxa") output = [] + taxa = pd.read_csv(args.taxonomy, usecols=["taxon_id","leaf_class_id","iconic_class_id"]).dropna(subset=['leaf_class_id']) + taxon_ids = taxa.taxon_id + if args.stop_after is not None: + taxon_ids = taxon_ids[0:args.stop_after] + desired_recall = 0.95 + resolution = args.h3_resolution + area = h3.hex_area(resolution) for taxon_id in tqdm(taxon_ids): - absences = pseudoabsence_df[pseudoabsence_df.taxon_id==taxon_id] - presences = test_df[test_df.taxon_id==taxon_id] - test = np.concatenate(([1] * len(presences), [0] * len(absences))) - - longitude = np.concatenate(( - presences.lng.values.astype('float32'), - absences.lng.values.astype('float32') - )) - - latitude = np.concatenate(( - presences.lat.values.astype('float32'), - absences.lat.values.astype('float32') - )) - - if args.model_type == "tf": - predictions = get_predictions(latitude, longitude, taxon_id, mt, tfgpm) - elif args.model_type == "pytorch": - predictions = get_predictions_py(latitude, longitude, taxon_id, net_params, model, torch) - threshold, prauc = get_threshold(test, predictions) - + try: + class_of_interest = mt.node_key_to_leaf_class_id[taxon_id] + except: + print('not in the model for some reason') + continue + + #get predictions + preds = tfgpm.eval_one_class_elevation_from_features(feats, class_of_interest) + gdfk["pred"] = tf.squeeze(preds).numpy() + + #make presence absence dataset + target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts() + presences = gdfk.loc[target_spatial_grid_counts.index]["pred"] + if len(presences) == 0: + print("not present") + continue + + #calculate threhold + presence_absence["forground"] = target_spatial_grid_counts + presence_absence["predictions"] = gdfk["pred"] + presence_absence.forground = presence_absence.forground.fillna(0) + yield_cutoff = np.percentile((presence_absence["background"]/presence_absence["forground"])[presence_absence["forground"]>0], 95) + absences = presence_absence[(presence_absence["forground"]==0) & (presence_absence["background"] > yield_cutoff)]["predictions"] + presences = presence_absence[(presence_absence["forground"]>0)]["predictions"] + df_x = pd.DataFrame({'predictions': presences, 'test': 1}) + df_y = pd.DataFrame({'predictions': absences, 'test': 0}) + for_thres = pd.concat([df_x, df_y], ignore_index=False) + precision, recall, thresholds = precision_recall_curve(for_thres.test, for_thres.predictions) + p1 = (2 * precision * recall) + p2 = (precision + recall) + out = np.zeros( (len(p1)) ) + fscore = np.divide(p1,p2, out=out, where=p2!=0) + index = np.argmax(fscore) + thres = thresholds[index] + + #store daa row = { "taxon_id": taxon_id, - "threshold": threshold, - "auc": prauc, - "num_presences": len(presences), - "num_absences": len(absences) + "thres": thres, + "area": len(gdfk[gdfk.pred >= thres])*area } row_dict = dict(row) output.append(row_dict) - + print("writing output...") - 
pd.DataFrame(output).to_csv(args.output_path) + output_pd = pd.DataFrame(output) + output_pd.to_csv(args.output_dir+"/thresholds.csv") if __name__ == "__main__": info_str = '\nrun as follows\n' + \ - ' python generate_thresholds.py --model_type tf \n' + \ - ' --model_path tf_geoprior_1_4_r6.h5 \n' + \ - ' --taxonomy_path taxonomy_1_4.csv\n' + \ - ' --test_spatial_data_path test_spatial_data.csv\n' + \ - ' --train_spatial_data_path train_spatial_data.csv\n' + \ - ' --output_path thresholds.csv\n' + \ - ' --h3_resolution 3\n' + \ + ' python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n' + \ + ' --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n' + \ + ' --taxonomy taxonomy_1_4.csv\n' + \ + ' --train_spatial_data v2_6/taxonomy.csv\n' + \ + ' --output_dir v2_6\n' + \ + ' --h3_resolution 4\n' + \ ' --stop_after 10\n' parser = argparse.ArgumentParser(usage=info_str) - parser.add_argument('--model_type', type=str, - help='Can be either "tf" or "pytorch".', required=True) - parser.add_argument('--model_path', type=str, + parser.add_argument('--elevation', type=str, + help='Path to elev tif.', required=True) + parser.add_argument('--model', type=str, help='Path to tf model.', required=True) - parser.add_argument('--taxonomy_path', type=str, + parser.add_argument('--taxonomy', type=str, help='Path to taxonomy csv.', required=True) - parser.add_argument('--test_spatial_data_path', type=str, - help='Path to test csv for presence/absences.', required=True) - parser.add_argument('--train_spatial_data_path', type=str, + parser.add_argument('--train_spatial_data', type=str, help='Path to train csv for occupancy.', required=True) - parser.add_argument('--output_path', type=str, - help='Path to write thesholds.', required=True) - parser.add_argument('--h3_resolution', type=int, default=3, - help='grid resolution from 0 - 15, lower numbers are coarser/faster. Recommend 3, 4, or 5') + parser.add_argument('--output_dir', type=str, + help='directory to write thesholds.', required=True) + parser.add_argument('--h3_resolution', type=int, default=4, + help='grid resolution from 0 - 15, lower numbers are coarser/faster. Currently using 4') parser.add_argument('--stop_after', type=int, - help='just run the first x taxa') + help='just run the first x taxa') args = parser.parse_args() main(args) + \ No newline at end of file diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py index 8e155f9..1a6fb67 100644 --- a/lib/inat_inferrer.py +++ b/lib/inat_inferrer.py @@ -1,11 +1,22 @@ -import os +import time import magic import tensorflow as tf +import pandas as pd +import h3 +import h3pandas # noqa: F401 +import math +import os +import tifffile +import numpy as np from PIL import Image -from lib.pt_geo_prior_model import PTGeoPriorModel -from lib.tf_gp_model import TFGeoPriorModel +from lib.tf_gp_elev_model import TFGeoPriorModelElev from lib.vision_inferrer import VisionInferrer -from lib.model_taxonomy import ModelTaxonomy +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe + +# TODO: better way to address the SettingWithCopyWarning warning? 
+pd.options.mode.chained_assignment = None + +MINIMUM_GEO_SCORE = 0.005 class InatInferrer: @@ -14,35 +25,367 @@ def __init__(self, config): self.config = config self.setup_taxonomy(config) self.setup_vision_model(config) + self.setup_elevation_dataframe(config) self.setup_geo_model(config) self.upload_folder = "static/" def setup_taxonomy(self, config): - self.taxonomy = ModelTaxonomy(config["taxonomy_path"]) + self.taxonomy = ModelTaxonomyDataframe( + config["taxonomy_path"], + config["tf_elev_thresholds"] if "tf_elev_thresholds" in config else None + ) def setup_vision_model(self, config): self.vision_inferrer = VisionInferrer(config["vision_model_path"], self.taxonomy) + def setup_elevation_dataframe(self, config): + self.geo_elevation_cells = None + # load elevation data stored at H3 resolution 4 + if "elevation_h3_r4" in config: + self.geo_elevation_cells = pd.read_csv(config["elevation_h3_r4"]). \ + sort_values("h3_04").set_index("h3_04").sort_index() + self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe(self.geo_elevation_cells) + + def setup_elevation_dataframe_from_worldclim(self, config, resolution): + # preventing from processing at too high a resolution + if resolution > 5: + return + if "wc2.1_5m_elev_2.tif" in config: + im = tifffile.imread(config["wc2.1_5m_elev_2.tif"]) + im_df = pd.DataFrame(im) + im_df.index = np.linspace(90, -90, 2160) + im_df.columns = np.linspace(-180, 180, 4320) + im_df = im_df.reset_index() + im_df = im_df.melt(id_vars=["index"]) + im_df.columns = ["lat", "lng", "elevation"] + elev_dfh3 = im_df.h3.geo_to_h3(resolution) + elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f'h3_0{resolution}').mean() + def setup_geo_model(self, config): - if "use_pt_gp_model" in config and config["use_pt_gp_model"] and "pt_geo_model_path" in config: - self.geo_model = PTGeoPriorModel(config["pt_geo_model_path"], self.taxonomy) - elif "tf_geo_model_path" in config: - self.geo_model = TFGeoPriorModel(config["tf_geo_model_path"], self.taxonomy) - else: - self.geo_model = None + self.geo_elevation_model = None + self.geo_model_features = None + if "tf_geo_elevation_model_path" in config and self.geo_elevation_cells is not None: + self.geo_elevation_model = TFGeoPriorModelElev(config["tf_geo_elevation_model_path"]) + self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation( + latitude=list(self.geo_elevation_cells.lat), + longitude=list(self.geo_elevation_cells.lng), + elevation=list(self.geo_elevation_cells.elevation) + ) - def prepare_image_for_inference(self, file_path, image_uuid): + def prepare_image_for_inference(self, file_path): mime_type = magic.from_file(file_path, mime=True) # attempt to convert non jpegs if mime_type != "image/jpeg": im = Image.open(file_path) - rgb_im = im.convert("RGB") - file_path = os.path.join(self.upload_folder, image_uuid) + ".jpg" - rgb_im.save(file_path) - - image = tf.io.read_file(file_path) - image = tf.image.decode_jpeg(image, channels=3) + image = im.convert("RGB") + else: + image = tf.io.read_file(file_path) + image = tf.image.decode_jpeg(image, channels=3) image = tf.image.convert_image_dtype(image, tf.float32) image = tf.image.central_crop(image, 0.875) image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) return tf.expand_dims(image, 0) + + def vision_predict(self, image, debug=False): + if debug: + start_time = time.time() + vision_scores = self.vision_inferrer.process_image(image) + if debug: + print("Vision Time: %0.2fms" % ((time.time() - 
start_time) * 1000.)) + return vision_scores + + def geo_model_predict(self, lat, lng, debug=False): + if debug: + start_time = time.time() + if lat is None or lat == "" or lng is None or lng == "": + return None + if self.geo_elevation_model is None: + return None + # lookup the H3 cell this lat lng occurs in + h3_cell = h3.geo_to_h3(float(lat), float(lng), 4) + h3_cell_centroid = h3.h3_to_geo(h3_cell) + # get the average elevation of the above H3 cell + elevation = self.geo_elevation_cells.loc[h3_cell].elevation + geo_scores = self.geo_elevation_model.predict( + h3_cell_centroid[0], h3_cell_centroid[1], float(elevation)) + if debug: + print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + return geo_scores + + def lookup_taxon(self, taxon_id): + if taxon_id is None: + return None + try: + return self.taxonomy.df.loc[taxon_id] + except Exception as e: + print(f'taxon `{taxon_id}` does not exist in the taxonomy') + raise e + + def predictions_for_image(self, file_path, lat, lng, filter_taxon, score_without_geo=False, + debug=False): + if debug: + start_time = time.time() + image = self.prepare_image_for_inference(file_path) + raw_vision_scores = self.vision_predict(image, debug) + raw_geo_scores = self.geo_model_predict(lat, lng, debug) + top100 = self.combine_results(raw_vision_scores, raw_geo_scores, filter_taxon, + score_without_geo, debug) + if debug: + print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + return top100 + + def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon, + score_without_geo=False, debug=False): + if debug: + start_time = time.time() + no_geo_scores = (raw_geo_scores is None) + + # make a copy of the model taxonomy leaf nodes to be used for storing results. Skip any + # filtering at this stage as the taxonomy dataframe needs to have the same number of + # leaf taxa and in the same order as the raw scores + leaf_scores = self.taxonomy.leaf_df.copy() + # add a column for vision scores + leaf_scores["vision_score"] = raw_vision_scores + # add a column for geo scores + leaf_scores["geo_score"] = 0 if no_geo_scores else raw_geo_scores + # set a lower limit for geo scores if there are any + leaf_scores["normalized_geo_score"] = 0 if no_geo_scores \ + else leaf_scores["geo_score"].clip(MINIMUM_GEO_SCORE, None) + + # if filtering by a taxon, restrict results to that taxon and its descendants + if filter_taxon is not None: + # using nested set left and right values, select the filter taxon and its descendants + leaf_scores = leaf_scores.query( + f'left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}' + ) + # normalize the vision scores so they add up to 1 after filtering + sum_of_vision_scores = leaf_scores["vision_score"].sum() + leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] / sum_of_vision_scores + else: + # when not filtering by a taxon, the normalized vision score is the same as the original + leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] + + if no_geo_scores or score_without_geo: + # if there are no geo scores, or it was requested to not use geo scores to affect + # the final combined score, set the combined scores to be the same as the vision scores + leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"] + else: + # the combined score is simply the normalized vision score + # multipliedby the normalized geo score + leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"] * \ + leaf_scores["normalized_geo_score"] + if 
debug: + print("Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + return leaf_scores + + def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, debug=False): + if debug: + start_time = time.time() + + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + # make a copy of the full taxonomy including non-leaves to be used for storing results + if filter_taxon is not None: + # using nested set left and right values, select the filter taxon, + # its descendants, and its ancestors + all_node_scores = self.taxonomy.df.query( + f'(left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}) or' + + f'(left < {filter_taxon["left"]} and right > {filter_taxon["right"]})' + ).copy().reset_index(drop=True) + else: + all_node_scores = self.taxonomy.df.copy().reset_index(drop=True) + + # copy the score columns from the already-calculated leaf scores + all_node_scores = pd.merge(all_node_scores, leaf_scores[[ + "taxon_id", "vision_score", "normalized_vision_score", "geo_score", + "normalized_geo_score"]], on="taxon_id", how="left").set_index("taxon_id", drop=False) + + # calculate the highest combined score + top_combined_score = leaf_scores.sort_values( + "combined_score", ascending=False).head(1)["combined_score"].values[0] + lower_cutoff_threshold = 0.0001 + # determine a lower-bound cutoff where results with combined scores below this cutoff + # will be ignored. This isn't necessary for scoring, but it improves performance + # TODO: evaluate this + cutoff = max([0.00001, top_combined_score * lower_cutoff_threshold]) + + aggregated_scores = {} + # restrict score aggregation to results where the combined score is above the cutoff + scores_to_aggregate = leaf_scores.query(f'combined_score > {cutoff}') + # loop through all results where the combined score is above the cutoff + for taxon_id, vision_score, geo_score, geo_threshold in zip( + scores_to_aggregate["taxon_id"], + scores_to_aggregate["normalized_vision_score"], + scores_to_aggregate["normalized_geo_score"], + scores_to_aggregate["geo_threshold"] + ): + # loop through the pre-calculated ancestors of this result's taxon + for ancestor_taxon_id in self.taxonomy.taxon_ancestors[taxon_id]: + # set default values for the ancestor the first time it is referenced + if ancestor_taxon_id not in aggregated_scores: + aggregated_scores[ancestor_taxon_id] = {} + aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] = 0 + if not no_geo_scores: + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0 + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 100 + # aggregated vision score is a sum of descendant scores + aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score + if not no_geo_scores and geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: + # aggregated geo score is the max of descendant geo scores + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score + if not no_geo_scores and \ + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] != 0 and \ + geo_score > geo_threshold: + # aggregated geo threshold is set to 0 if any descendants are above their threshold + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 0 + + # turn the aggregated_scores dict into a data frame + scores_df = pd.DataFrame.from_dict(aggregated_scores, orient="index") + # merge the aggregated scores into the scoring taxonomy + all_node_scores = all_node_scores.join(scores_df) + + # the aggregated 
scores of leaves will be NaN, so populate them with their original scores + all_node_scores["aggregated_vision_score"].fillna( + all_node_scores["normalized_vision_score"], inplace=True) + if no_geo_scores: + all_node_scores["aggregated_geo_score"] = 0 + all_node_scores["aggregated_geo_threshold"] = 0 + else: + all_node_scores["aggregated_geo_score"].fillna( + all_node_scores["normalized_geo_score"], inplace=True) + all_node_scores["aggregated_geo_threshold"].fillna( + all_node_scores["geo_threshold"], inplace=True) + + if (no_geo_scores or score_without_geo): + # if there are no geo scores, or it was requested to not use geo scores to affect + # the final combined score, set the combined scores to be the same as the vision scores + all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] + else: + # the combined score is simply the normalized vision score + # multipliedby the normalized geo score + all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] * \ + all_node_scores["aggregated_geo_score"] + + # calculate a normalized combined score so all values add to 1, to be used for thresholding + sum_of_root_node_aggregated_combined_scores = all_node_scores.query( + "parent_taxon_id.isnull()")["aggregated_combined_score"].sum() + all_node_scores["normalized_aggregated_combined_score"] = all_node_scores[ + "aggregated_combined_score"] / sum_of_root_node_aggregated_combined_scores + + if debug: + print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + thresholded_results = all_node_scores.query("normalized_aggregated_combined_score > 0.05") + print("\nTree of aggregated results:") + ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=( + lambda row: f'{row.name} [' + + f'V:{round(row.aggregated_vision_score, 4)}, ' + + f'G:{round(row.aggregated_geo_score, 4)}, ' + + f'C:{round(row.aggregated_combined_score, 4)}, ' + + f'NC:{round(row.normalized_aggregated_combined_score, 4)}]')) + print("") + return all_node_scores + + def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, raw_results=False): + if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None): + return + try: + taxon = self.taxonomy.df.loc[taxon_id] + except Exception as e: + print(f'taxon `{taxon_id}` does not exist in the taxonomy') + raise e + if math.isnan(taxon["leaf_class_id"]): + return + + geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features( + self.geo_model_features, int(taxon["leaf_class_id"])) + geo_score_cells = self.geo_elevation_cells.copy() + geo_score_cells["geo_score"] = tf.squeeze(geo_scores).numpy() + if thresholded: + geo_score_cells = geo_score_cells.query(f'geo_score > {taxon["geo_threshold"]}') + else: + # return scores more than 10% of the taxon threshold, or more than 0.0001, whichever + # is smaller. 
This reduces data needed to be redendered client-side for the Data Layer + # mapping approach, and maybe can be removed once switching to map tiles + lower_bound_score = np.array([0.0001, taxon["geo_threshold"] / 10]).min() + geo_score_cells = geo_score_cells.query(f'geo_score > {lower_bound_score}') + + if bounds: + min = geo_score_cells["geo_score"].min() + max = geo_score_cells["geo_score"].max() + geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds(geo_score_cells, bounds) + # perform a log transform on the scores based on the min/max score for the unbounded set + geo_score_cells["geo_score"] = \ + (np.log10(geo_score_cells["geo_score"]) - math.log10(min)) / \ + (math.log10(max) - math.log10(min)) + + if raw_results: + return geo_score_cells + return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"])) + + def h3_04_taxon_range(self, taxon_id, bounds=[]): + taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f'{taxon_id}.csv') + if not os.path.exists(taxon_range_path): + return None + taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). \ + sort_values("h3_04").set_index("h3_04").sort_index() + taxon_range_df = InatInferrer.add_lat_lng_to_h3_geo_dataframe(taxon_range_df) + if bounds: + taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds(taxon_range_df, bounds) + taxon_range_df["value"] = 1 + return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"])) + + def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]): + geomodel_results = self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {} + taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {} + combined_results = {} + for cell_key in geomodel_results: + if cell_key in taxon_range_results: + combined_results[cell_key] = 0.5 + else: + combined_results[cell_key] = 0 + for cell_key in taxon_range_results: + if cell_key not in geomodel_results: + combined_results[cell_key] = 1 + return combined_results + + def h3_04_bounds(self, taxon_id): + geomodel_results = self.h3_04_geo_results_for_taxon( + taxon_id, bounds=None, thresholded=True, raw_results=True) + if geomodel_results is None: + return + return { + "swlat": geomodel_results["lat"].min(), + "swlng": geomodel_results["lng"].min(), + "nelat": geomodel_results["lat"].max(), + "nelng": geomodel_results["lng"].max() + } + + @staticmethod + def add_lat_lng_to_h3_geo_dataframe(geo_df): + geo_df = geo_df.h3.h3_to_geo() + geo_df["lng"] = geo_df["geometry"].x + geo_df["lat"] = geo_df["geometry"].y + geo_df.pop("geometry") + return geo_df + + @staticmethod + def filter_geo_dataframe_by_bounds(geo_df, bounds): + # this is querying on the centroid, but cells just outside the bounds may have a + # centroid outside the bounds while part of the polygon is within the bounds. 
Add + # a small buffer to ensure this returns any cell whose polygon is + # even partially within the bounds + buffer = 0.6 + + # similarly, the centroid may be on the other side of the antimedirian, so lookup + # cells that might be just over the antimeridian on either side + antimedirian_condition = "" + if bounds[1] < -179: + antimedirian_condition = "or (lng > 179)" + elif bounds[3] > 179: + antimedirian_condition = "or (lng < -179)" + + # query for cells wtihin the buffered bounds, and potentially + # on the other side of the antimeridian + query = f'lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and ' + \ + f' ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})' + \ + f' {antimedirian_condition})' + return geo_df.query(query) diff --git a/lib/inat_vision_api.py b/lib/inat_vision_api.py index 1cdb8f1..8703c43 100644 --- a/lib/inat_vision_api.py +++ b/lib/inat_vision_api.py @@ -8,7 +8,6 @@ from flask import Flask, request, render_template from web_forms import ImageForm from inat_inferrer import InatInferrer -from model_scoring import ModelScoring class InatVisionAPI: @@ -18,12 +17,59 @@ def __init__(self, config): self.app = Flask(__name__) self.app.secret_key = config["app_secret"] self.upload_folder = "static/" - self.app.add_url_rule( - "/", "index", self.index_route, methods=["GET", "POST"]) + self.app.add_url_rule("/", "index", self.index_route, methods=["GET", "POST"]) + self.app.add_url_rule("/h3_04", "h3_04", self.h3_04_route, methods=["GET"]) + self.app.add_url_rule("/h3_04_taxon_range", "h3_04_taxon_range", + self.h3_04_taxon_range_route, methods=["GET"]) + self.app.add_url_rule("/h3_04_taxon_range_comparison", "h3_04_taxon_range_comparison", + self.h3_04_taxon_range_comparison_route, methods=["GET"]) + self.app.add_url_rule("/h3_04_bounds", "h3_04_bounds", + self.h3_04_bounds_route, methods=["GET"]) def setup_inferrer(self, config): self.inferrer = InatInferrer(config) + def h3_04_route(self): + return self.h3_04_default_route(self.inferrer.h3_04_geo_results_for_taxon) + + def h3_04_taxon_range_route(self): + if "taxon_ranges_path" not in self.inferrer.config: + return "taxon range data unavilable", 422 + return self.h3_04_default_route(self.inferrer.h3_04_taxon_range) + + def h3_04_taxon_range_comparison_route(self): + if "taxon_ranges_path" not in self.inferrer.config: + return "taxon range data unavilable", 422 + return self.h3_04_default_route(self.inferrer.h3_04_taxon_range_comparison) + + def h3_04_default_route(self, h3_04_method): + taxon_id, error_message, error_code = self.valid_leaf_taxon_id_for_request(request) + if error_message: + return error_message, error_code + + bounds, error_message, error_code = self.valid_bounds_for_request(request) + if error_message: + return error_message, error_code + + if h3_04_method == self.inferrer.h3_04_geo_results_for_taxon \ + and "thresholded" in request.args and request.args["thresholded"] == "true": + results_dict = h3_04_method(taxon_id, bounds, thresholded=True) + else: + results_dict = h3_04_method(taxon_id, bounds) + if results_dict is None: + return f'Unknown taxon_id {taxon_id}', 422 + return InatVisionAPI.round_floats(results_dict, 8) + + def h3_04_bounds_route(self): + taxon_id, error_message, error_code = self.valid_leaf_taxon_id_for_request(request) + if error_message: + return error_message, error_code + + results_dict = self.inferrer.h3_04_bounds(taxon_id) + if results_dict is None: + return f'Unknown taxon_id {taxon_id}', 422 + return results_dict + def index_route(self): form = 
ImageForm() if "observation_id" in request.args: @@ -41,6 +87,7 @@ def index_route(self): lng = form.lng.data file_path = None image_uuid = None + iconic_taxon_id = None if observation_id: image_uuid = "downloaded-obs-" + observation_id file_path, lat, lng, iconic_taxon_id = self.download_observation( @@ -48,67 +95,95 @@ def index_route(self): else: image_uuid = str(uuid.uuid4()) file_path = self.process_upload(form.image.data, image_uuid) - iconic_taxon_id = None + if form.taxon_id.data and form.taxon_id.data.isdigit(): + iconic_taxon_id = int(form.taxon_id.data) if file_path is None: return render_template("home.html") - image = self.inferrer.prepare_image_for_inference(file_path, image_uuid) - scores = self.score_image(image, lat, lng, iconic_taxon_id, geomodel) - self.write_logstash(image_uuid, file_path, request_start_datetime, request_start_time) + scores = self.score_image(form, file_path, lat, lng, iconic_taxon_id, geomodel) + InatVisionAPI.write_logstash( + image_uuid, file_path, request_start_datetime, request_start_time) return scores else: return render_template("home.html") - def score_image(self, image, lat, lng, iconic_taxon_id, geomodel): - # Vision - vision_start_time = time.time() - vision_scores = self.inferrer.vision_inferrer.process_image(image, iconic_taxon_id) - vision_total_time = time.time() - vision_start_time - print("Vision Time: %0.2fms" % (vision_total_time * 1000.)) - - if geomodel != "true": - top_x = dict(sorted( - vision_scores.items(), key=lambda x: x[1], reverse=True)[:100]) - to_return = {} - for index, arg in enumerate(top_x): - to_return[arg] = round(vision_scores[arg] * 100, 6) - return to_return - - # Geo - geo_start_time = time.time() - if lat is not None and lat != "" and lng is not None and lng != "": - geo_scores = self.inferrer.geo_model.predict(lat, lng, iconic_taxon_id) + def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): + score_without_geo = (form.score_without_geo.data == "true") + filter_taxon = self.inferrer.lookup_taxon(iconic_taxon_id) + leaf_scores = self.inferrer.predictions_for_image( + file_path, lat, lng, filter_taxon, score_without_geo + ) + + if form.aggregated.data == "true": + aggregated_results = self.inferrer.aggregate_results(leaf_scores, filter_taxon, + score_without_geo) + columns_to_return = [ + "aggregated_combined_score", + "aggregated_geo_score", + "taxon_id", + "name", + "aggregated_vision_score", + "aggregated_geo_threshold" + ] + column_mapping = { + "taxon_id": "id", + "aggregated_combined_score": "combined_score", + "aggregated_geo_score": "geo_score", + "aggregated_vision_score": "vision_score", + "aggregated_geo_threshold": "geo_threshold" + } + + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + + # set a cutoff where branches whose combined scores are below the threshold are ignored + # TODO: this threshold is completely arbitrary and needs testing + aggregated_results = aggregated_results.query("normalized_aggregated_combined_score > 0.05") + + # after setting a cutoff, get the parent IDs of the remaining taxa + parent_taxon_ids = aggregated_results["parent_taxon_id"].values # noqa: F841 + # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the + # taxa who are not parents of any remaining taxa + leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids") + + leaf_results = leaf_results.sort_values("aggregated_combined_score", ascending=False).head(100) + score_columns = ["aggregated_combined_score", "aggregated_geo_score", + 
"aggregated_vision_score", "aggregated_geo_threshold"] + leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index") + final_results = leaf_results[columns_to_return].rename(columns=column_mapping) else: - geo_scores = {} - geo_total_time = time.time() - geo_start_time - print("GeoTime: %0.2fms" % (geo_total_time * 1000.)) - - # Scoring - scoring_start_time = time.time() - combined_scores = ModelScoring.combine_vision_and_geo_scores( - vision_scores, geo_scores, - self.inferrer.config["geo_min"] if "geo_min" in self.inferrer.config else 0) - - # results.aggregate_scores() - scoring_total_time = time.time() - scoring_start_time - print("Score Time: %0.2fms" % (scoring_total_time * 1000.)) - - top_x = dict(sorted( - combined_scores.items(), key=lambda x: x[1], reverse=True)[:100]) - to_return = [] - for index, arg in enumerate(top_x): - geo_score = geo_scores[arg] if arg in geo_scores else 0.0000000001 - to_return.append({ - "combined_score": round(combined_scores[arg] * 100, 6), - "vision_score": round(vision_scores[arg] * 100, 6), - "geo_score": round(geo_score * 100, 6), - "id": self.inferrer.taxonomy.taxa[arg].id, - "name": self.inferrer.taxonomy.taxa[arg].name - }) - - total_time = time.time() - vision_start_time - print("Total: %0.2fms" % (total_time * 1000.)) - return to_return + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + top_combined_score = leaf_scores.sort_values("combined_score", ascending=False).head(1)["combined_score"].values[0] + # set a cutoff so results whose combined scores are + # much lower than the best score are not returned + leaf_scores = leaf_scores.query(f'combined_score > {top_combined_score * 0.001}') + + top100 = leaf_scores.sort_values("combined_score", ascending=False).head(100) + score_columns = ["combined_score", "geo_score", "normalized_vision_score", "geo_threshold"] + top100[score_columns] = top100[score_columns].multiply(100, axis="index") + + # legacy dict response + if geomodel != "true": + top_taxon_combined_scores = top100[ + ["taxon_id", "combined_score"] + ].to_dict(orient="records") + return {x["taxon_id"]: x["combined_score"] for x in top_taxon_combined_scores} + + # new array response + columns_to_return = [ + "combined_score", + "geo_score", + "taxon_id", + "name", + "normalized_vision_score", + "geo_threshold" + ] + column_mapping = { + "taxon_id": "id", + "normalized_vision_score": "vision_score" + } + final_results = top100[columns_to_return].rename(columns=column_mapping) + + return final_results.to_dict(orient="records") def process_upload(self, form_image_data, image_uuid): if form_image_data is None: @@ -137,7 +212,34 @@ def download_observation(self, observation_id, image_uuid): # return the path to the cached image, coordinates, and iconic taxon return cache_path, latlng[0], latlng[1], data["results"][0]["taxon"]["iconic_taxon_id"] - def write_logstash(self, image_uuid, file_path, request_start_datetime, request_start_time): + def valid_leaf_taxon_id_for_request(self, request): + if "taxon_id" in request.args: + taxon_id = request.args["taxon_id"] + else: + return None, "taxon_id required", 422 + if not taxon_id.isdigit(): + return None, "taxon_id must be an integer", 422 + + taxon_id = int(taxon_id) + if float(taxon_id) not in self.inferrer.taxonomy.leaf_df["taxon_id"].values: + return None, f'Unknown taxon_id {taxon_id}', 422 + return taxon_id, None, None + + def valid_bounds_for_request(self, request): + bounds = [] + if "swlat" in request.args: + try: + swlat = float(request.args["swlat"]) + 
swlng = float(request.args["swlng"]) + nelat = float(request.args["nelat"]) + nelng = float(request.args["nelng"]) + except ValueError: + return None, "bounds must be floats", 422 + bounds = [swlat, swlng, nelat, nelng] + return bounds, None, None + + @staticmethod + def write_logstash(image_uuid, file_path, request_start_datetime, request_start_time): request_end_time = time.time() request_time = round((request_end_time - request_start_time) * 1000, 6) logstash_log = open('log/logstash.log', 'a') @@ -149,3 +251,13 @@ def write_logstash(self, image_uuid, file_path, request_start_datetime, request_ json.dump(log_data, logstash_log) logstash_log.write("\n") logstash_log.close() + + @staticmethod + def round_floats(o, sig): + if isinstance(o, float): + return round(o, sig) + if isinstance(o, dict): + return {k: InatVisionAPI.round_floats(v, sig) for k, v in o.items()} + if isinstance(o, (list, tuple)): + return [InatVisionAPI.round_floats(x, sig) for x in o] + return o diff --git a/lib/model_scoring.py b/lib/model_scoring.py deleted file mode 100644 index 645de55..0000000 --- a/lib/model_scoring.py +++ /dev/null @@ -1,11 +0,0 @@ -class ModelScoring: - - @staticmethod - def combine_vision_and_geo_scores(vision_scores, geo_scores, geo_min=0): - combined_scores = {} - for arg in vision_scores: - geo_score = geo_scores[arg] if arg in geo_scores else 0.0000000001 - if geo_min > 0 and geo_score < geo_min: - geo_score = geo_min - combined_scores[arg] = vision_scores[arg] * geo_score - return combined_scores diff --git a/lib/model_taxonomy.py b/lib/model_taxonomy.py index 0362587..d390516 100644 --- a/lib/model_taxonomy.py +++ b/lib/model_taxonomy.py @@ -11,7 +11,7 @@ def __init__(self, path): def load_mapping(self, path): self.node_key_to_leaf_class_id = {} self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0, + # there is no taxon with ID 0, but roots of the taxonomy have a parent ID of 0, # so create a fake taxon of Life to represent the root of the entire tree self.taxa = {0: Taxon({"name": "Life", "depth": 0})} self.taxon_children = {} diff --git a/lib/model_taxonomy_dataframe.py b/lib/model_taxonomy_dataframe.py new file mode 100644 index 0000000..a707c7d --- /dev/null +++ b/lib/model_taxonomy_dataframe.py @@ -0,0 +1,74 @@ +import math +import pandas as pd + + +class ModelTaxonomyDataframe: + + def __init__(self, path, thresholds_path): + self.load_mapping(path, thresholds_path) + + def load_mapping(self, path, thresholds_path): + self.df = pd.read_csv(path) + # left and right will be used to store nested set indices + self.df["left"] = pd.Series([], dtype=object) + self.df["right"] = pd.Series([], dtype=object) + self.taxon_children = {} + self.taxon_row_mapping = {} + self.taxon_ancestors = {} + for index, taxon in self.df.iterrows(): + self.taxon_row_mapping[taxon["taxon_id"]] = index + parent_id = 0 if math.isnan(taxon["parent_taxon_id"]) else int(taxon["parent_taxon_id"]) + if parent_id not in self.taxon_children: + self.taxon_children[parent_id] = [] + self.taxon_children[parent_id].append(taxon["taxon_id"]) + self.assign_nested_values() + self.df = self.df.set_index("taxon_id", drop=False).sort_index() + if thresholds_path is not None: + thresholds = pd.read_csv(thresholds_path)[["taxon_id", "thres"]]. 
\ + rename(columns={"thres": "geo_threshold"}).set_index("taxon_id").sort_index() + self.df = self.df.join(thresholds) + + # create a data frame with just the leaf taxa using leaf_class_id as the index + self.leaf_df = self.df.query("leaf_class_id.notnull()").set_index( + "leaf_class_id", drop=False).sort_index() + + # calculate nested set left and right values. These can be later used for an efficient + # way to calculate if a taxon is an ancestor or descendant of another + def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]): + for child_id in self.taxon_children[taxon_id]: + self.df.at[self.taxon_row_mapping[child_id], "left"] = index + self.taxon_ancestors[child_id] = ancestor_taxon_ids + index += 1 + if child_id in self.taxon_children: + child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id] + index = self.assign_nested_values(child_id, index, child_ancestor_taxon_ids) + self.df.at[self.taxon_row_mapping[child_id], "right"] = index + index += 1 + return index + + @staticmethod + def children(df, taxon_id): + if taxon_id == 0: + return df.query("parent_taxon_id.isnull()") + return df.query(f'parent_taxon_id == {taxon_id}') + + @staticmethod + def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): + children = ModelTaxonomyDataframe.children(df, taxon_id) + index = 0 + if "aggregated_combined_score" in children: + children = children.sort_values("aggregated_combined_score", ascending=False) + else: + children = children.sort_values("name") + for row in children.itertuples(): + last_in_branch = (index == len(children) - 1) + index += 1 + icon = "└──" if last_in_branch else "├──" + prefixIcon = " " if last_in_branch else "│ " + print(f'{ancestor_prefix}{icon}', end="") + if display_taxon_lambda is None: + print(f'{row.name} :: {row.left}:{row.right}') + else: + print(display_taxon_lambda(row)) + if row.right != row.left + 1: + ModelTaxonomyDataframe.print(df, row.taxon_id, f"{ancestor_prefix}{prefixIcon}", display_taxon_lambda) diff --git a/lib/model_test_data_exporter.py b/lib/model_test_data_exporter.py new file mode 100644 index 0000000..df60bad --- /dev/null +++ b/lib/model_test_data_exporter.py @@ -0,0 +1,204 @@ +import pandas as pd +import requests +import json +import prison +import re +from datetime import datetime + + +class ModelTestDataExporter: + + def __init__(self, **args): + self.cmd_args = args + self.load_train_data_photo_ids() + + def load_train_data_photo_ids(self, train_data_paths=[]): + if not self.cmd_args["exclude_train_photos_path"]: + self.train_data_photo_ids = [] + return + self.train_data_photo_ids = pd.concat( + map(lambda x: pd.read_csv(x, usecols=["photo_id"]), + self.cmd_args["exclude_train_photos_path"]) + ).drop_duplicates("photo_id").set_index("photo_id").sort_index().index + + def generate_from_cmd_args(self): + additional_parameters = {} + if self.cmd_args["place_id"]: + additional_parameters["place_id"] = self.cmd_args["place_id"] + if self.cmd_args["taxon_id"]: + additional_parameters["taxon_id"] = self.cmd_args["taxon_id"] + currentDatetime = datetime.now() + timestamp = currentDatetime.strftime("%Y%m%d") + export_path = f'test-obs-{timestamp}' + if additional_parameters: + parameter_string = "-".join(map(lambda index: f'{index}-{additional_parameters[index]}', + additional_parameters)) + export_path += "-" + parameter_string + if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]: + export_path += "-" + self.cmd_args["filename_suffix"] + export_path += ".csv" + 
self.generate_test_data(export_path, self.cmd_args["limit"], additional_parameters) + + def export_test_data_parameters(self, additional_parameters={}): + api_parameters = {} + api_parameters["quality_grade"] = "research,casual" + api_parameters["rank"] = "species" + api_parameters["photos"] = "true" + api_parameters["geo"] = "true" + api_parameters["identified"] = "true" + api_parameters["per_page"] = 200 + api_parameters["order_by"] = "random" + api_parameters["identifications"] = "most_agree" + api_parameters["ttl"] = -1 + api_parameters.update(additional_parameters) + print(api_parameters) + + fields = { + "quality_grade": True, + "observed_on_details": { + "date": True + }, + "location": True, + "photos": { + "url": True + }, + "community_taxon_id": True, + "taxon": { + "id": True, + "ancestor_ids": True, + "iconic_taxon_id": True + }, + "quality_metrics": { + "agree": True, + "metric": True + } + } + api_parameters["fields"] = prison.dumps(fields) + return api_parameters + + def process_api_response(self, api_parameters, used_observations): # noqa: C901 + response = requests.get("https://api.inaturalist.org/v2/observations", + params=api_parameters) + json_object = response.json() + useable_rows = [] + for row in json_object["results"]: + if row["uuid"] in used_observations: + continue + + # must have a taxon and observed_on_details + if not row["taxon"] or "observed_on_details" not in row \ + or not row["observed_on_details"] or "taxon" not in row \ + or "iconic_taxon_id" not in row["taxon"] or not row["taxon"]["iconic_taxon_id"]: + used_observations[row["uuid"]] = True + continue + + # must pass quality metrics except wild + metric_counts = {} + for metric in row["quality_metrics"]: + if metric["metric"] not in metric_counts: + metric_counts[metric["metric"]] = 0 + if metric["agree"]: + metric_counts[metric["metric"]] += 1 + else: + metric_counts[metric["metric"]] -= 1 + if ("location" in metric_counts and metric_counts["location"] < 0) \ + or ("evidence" in metric_counts and metric_counts["evidence"] < 0) \ + or ("date" in metric_counts and metric_counts["date"] < 0) \ + or ("recent" in metric_counts and metric_counts["recent"] < 0): + used_observations[row["uuid"]] = True + continue + + # check if any photos are included in the test data + photo_in_training_data = False + for photo in row["photos"]: + if photo["id"] in self.train_data_photo_ids: + photo_in_training_data = True + break + if photo_in_training_data is True: + used_observations[row["uuid"]] = True + continue + if re.search(r"\.jpe?g", row["photos"][0]["url"]) is None: + used_observations[row["uuid"]] = True + continue + + if row["quality_grade"] == "casual" and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]): + used_observations[row["uuid"]] = True + continue + + useable_rows.append(row) + used_observations[row["uuid"]] = True + return useable_rows + + def generate_test_data(self, filename, max_results=5000, additional_parameters={}): + iterations_with_zero_results = 0 + rows_to_use = [] + used_observations = {} + api_parameters = self.export_test_data_parameters(additional_parameters) + + while len(rows_to_use) < max_results and iterations_with_zero_results < 5: + print(f'Fetching more results... 
{len(rows_to_use)} so far') + useable_rows = self.process_api_response(api_parameters, used_observations) + print(f'{len(useable_rows)} this batch') + if not useable_rows: + iterations_with_zero_results += 1 + continue + iterations_with_zero_results = 0 + rows_to_use += useable_rows + + columns = [ + "observation_id", + "observed_on", + "iconic_taxon_id", + "taxon_id", + "taxon_ancestry", + "lat", + "lng", + "photo_url" + ] + output_file = open(filename, "w") + output_file.write(",".join(columns) + "\n") + for row in rows_to_use[:max_results]: + [latitude, longitude] = row["location"].split(",") + columns_to_write = [ + row["uuid"], + row["observed_on_details"]["date"], + row["taxon"]["iconic_taxon_id"], + row["taxon"]["id"], + "/".join(map(str, row["taxon"]["ancestor_ids"])), + latitude, + longitude, + row["photos"][0]["url"].replace("square", "medium").replace("http://", "https://") + ] + output_file.write(",".join(map(str, columns_to_write)) + "\n") + + def generate_standard_set(self): + files_to_generate = { + "global": {}, + "fungi": {"taxon_id": 47170}, + "insecta": {"taxon_id": 47158}, + "mammalia": {"taxon_id": 40151}, + "plantae": {"taxon_id": 47126}, + "actinopterygii": {"taxon_id": 47178}, + "reptilia": {"taxon_id": 26036}, + "amphibia": {"taxon_id": 20978}, + "arachnida": {"taxon_id": 47119}, + "aves": {"taxon_id": 3}, + "animalia": {"taxon_id": 1}, + "mollusca": {"taxon_id": 47115}, + "north-america": {"place_id": 97394}, + "south-america": {"place_id": 97389}, + "europe": {"place_id": 97391}, + "asia": {"place_id": 97395}, + "africa": {"place_id": 97392}, + "oceania": {"place_id": 97393} + } + + currentDatetime = datetime.now() + timestamp = currentDatetime.strftime("%Y%m%d") + + for key in files_to_generate: + export_path = f'test-obs-{timestamp}-{key}' + if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]: + export_path += "-" + self.cmd_args["filename_suffix"] + export_path += ".csv" + self.generate_test_data(export_path, self.cmd_args["limit"], files_to_generate[key]) diff --git a/lib/pt_geo_prior_model.py b/lib/pt_geo_prior_model.py index 218a0d0..b81b4e9 100644 --- a/lib/pt_geo_prior_model.py +++ b/lib/pt_geo_prior_model.py @@ -29,7 +29,7 @@ def predict(self, latitude, longitude, filter_taxon_id=None): filter_taxon = None if filter_taxon_id is not None: try: - filter_taxon = self.taxonomy.taxa[filter_taxon_id] + filter_taxon = self.taxonomy.df.iloc[filter_taxon_id] except Exception as e: print(f'filter_taxon `{filter_taxon_id}` does not exist in the taxonomy') raise e diff --git a/lib/templates/home.html b/lib/templates/home.html index baad79e..37244e9 100644 --- a/lib/templates/home.html +++ b/lib/templates/home.html @@ -13,11 +13,11 @@

Slim vs Legacy Model

ObservationID: -
-
+
+
or upload file: -
-
+
+
Lat:
Lng:
@@ -31,11 +31,17 @@

Slim vs Legacy Model

-
-
+ + +
+
-
-
diff --git a/lib/tf_gp_elev_model.py b/lib/tf_gp_elev_model.py
new file mode 100644
index 0000000..ad9ebbd
--- /dev/null
+++ b/lib/tf_gp_elev_model.py
@@ -0,0 +1,133 @@
+import tensorflow as tf
+import numpy as np
+import math
+import os
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+
+class ResLayer(tf.keras.layers.Layer):
+    def __init__(self):
+        super(ResLayer, self).__init__()
+        self.w1 = tf.keras.layers.Dense(
+            256,
+            activation="relu",
+            kernel_initializer="he_normal"
+        )
+        self.w2 = tf.keras.layers.Dense(
+            256,
+            activation="relu",
+            kernel_initializer="he_normal"
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=0.5)
+        self.add = tf.keras.layers.Add()
+
+    def call(self, inputs):
+        x = self.w1(inputs)
+        x = self.dropout(x)
+        x = self.w2(x)
+        x = self.add([x, inputs])
+        return x
+
+    def get_config(self):
+        return {}
+
+
+class TFGeoPriorModelElev:
+
+    def __init__(self, model_path):
+        # initialize the geo model for inference
+        tf.config.set_visible_devices([], 'GPU')
+        visible_devices = tf.config.get_visible_devices()
+        for device in visible_devices:
+            assert device.device_type != 'GPU'
+        self.gpmodel = tf.keras.models.load_model(
+            model_path,
+            custom_objects={'ResLayer': ResLayer},
+            compile=False
+        )
+
+    def predict(self, latitude, longitude, elevation):
+        norm_lat = latitude / 90.0
+        norm_lng = longitude / 180.0
+        norm_loc = tf.stack([norm_lng, norm_lat])
+
+        if elevation > 0:
+            norm_elev = elevation / 6574
+        elif elevation == 0:
+            norm_elev = 0.0
+        else:
+            norm_elev = elevation / 32768
+
+        norm_elev = tf.expand_dims(norm_elev, axis=0)
+        encoded_loc = tf.concat([
+            tf.sin(norm_loc * math.pi),
+            tf.cos(norm_loc * math.pi),
+            norm_elev
+        ], axis=0)
+
+        return self.gpmodel(tf.convert_to_tensor(
+            tf.expand_dims(encoded_loc, axis=0)
+        ), training=False)[0]
+
+    def features_for_one_class_elevation(self, latitude, longitude, elevation):
+        """Evaluates the model's shared feature layers for multiple locations
+
+        Args:
+            latitude (list): A list of latitudes
+            longitude (list): A list of longitudes (same length as latitude)
+            elevation (list): A list of elevations (same length as latitude)
+
+        Returns:
+            tensor: geo model features at each location
+        """
+        def encode_loc(latitude, longitude, elevation):
+            latitude = np.array(latitude)
+            longitude = np.array(longitude)
+            elevation = np.array(elevation)
+            elevation = elevation.astype("float32")
+            grid_lon = longitude.astype('float32') / 180.0
+            grid_lat = latitude.astype('float32') / 90.0
+
+            elevation[elevation > 0] = elevation[elevation > 0] / 6574.0
+            elevation[elevation < 0] = elevation[elevation < 0] / 32768.0
+            norm_elev = elevation
+
+            if np.isscalar(grid_lon):
+                grid_lon = np.array([grid_lon])
+            if np.isscalar(grid_lat):
+                grid_lat = np.array([grid_lat])
+            if np.isscalar(norm_elev):
+                norm_elev = np.array([norm_elev])
+
+            norm_loc = tf.stack([grid_lon, grid_lat], axis=1)
+
+            encoded_loc = tf.concat([
+                tf.sin(norm_loc * math.pi),
+                tf.cos(norm_loc * math.pi),
+                tf.expand_dims(norm_elev, axis=1),
+
+            ], axis=1)
+            return encoded_loc
+
+        encoded_loc = encode_loc(latitude, longitude, elevation)
+        loc_emb = self.gpmodel.layers[0](encoded_loc)
+
+        # res layers - feature extraction
+        x = self.gpmodel.layers[1](loc_emb)
+        x = self.gpmodel.layers[2](x)
+        x = self.gpmodel.layers[3](x)
+        x = self.gpmodel.layers[4](x)
+
+        return x
+
+    def eval_one_class_elevation_from_features(self, features, class_of_interest):
+        # process just the one class
+        return
tf.keras.activations.sigmoid( + tf.matmul( + tf.expand_dims(self.gpmodel.layers[5].weights[0][:, class_of_interest], axis=0), + features, + transpose_b=True + ) + ).numpy() diff --git a/lib/tf_gp_model.py b/lib/tf_gp_model.py index dd542d3..1e12ade 100644 --- a/lib/tf_gp_model.py +++ b/lib/tf_gp_model.py @@ -44,15 +44,7 @@ def __init__(self, model_path, taxonomy): compile=False ) - def predict(self, latitude, longitude, filter_taxon_id=None): - filter_taxon = None - if filter_taxon_id is not None: - try: - filter_taxon = self.taxonomy.taxa[filter_taxon_id] - except Exception as e: - print(f'filter_taxon `{filter_taxon_id}` does not exist in the taxonomy') - raise e - + def predict(self, latitude, longitude): norm_lat = np.array([float(latitude)]) / 90.0 norm_lng = np.array([float(longitude)]) / 180.0 norm_loc = tf.stack([norm_lng, norm_lat], axis=1) @@ -60,22 +52,7 @@ def predict(self, latitude, longitude, filter_taxon_id=None): tf.sin(norm_loc * math.pi), tf.cos(norm_loc * math.pi) ], axis=1) - - preds = self.gpmodel.predict([encoded_loc], verbose=0)[0] - geo_pred_dict = {} - for index, pred in enumerate(preds): - if index not in self.taxonomy.leaf_class_to_taxon: - continue - taxon_id = self.taxonomy.leaf_class_to_taxon[index] - if filter_taxon_id is not None: - taxon = self.taxonomy.taxa[taxon_id] - # the predicted taxon is not the filter_taxon or a descendent, so skip it - if not taxon.is_or_descendant_of(filter_taxon): - continue - - geo_pred_dict[taxon_id] = pred - - return geo_pred_dict + return self.gpmodel.predict([encoded_loc], verbose=0)[0] def eval_one_class(self, latitude, longitude, class_of_interest): """Evalutes the model for a single class and multiple locations diff --git a/lib/vision_inferrer.py b/lib/vision_inferrer.py index 0796d8f..938c2e6 100644 --- a/lib/vision_inferrer.py +++ b/lib/vision_inferrer.py @@ -47,41 +47,7 @@ def results_fully_populated(self, results, score): number_of_results = len(results) return (number_of_results >= 500 and score < 0.00000001) or number_of_results >= 5000 - # given an image object (usually coming from prepare_image_for_inference), an optional - # filter taxon, and an optional cache_key, calculate vision results for the image, within the - # filter taxon if provided, and save to a file if provided a cache_key - def process_image(self, image, filter_taxon_id=None, cache_key=None): - cache_path = self.cache_path_for_request(cache_key) - cached_results = self.cached_results(cache_path) - # vision results for this cache_key have already been generated, so use them - if cached_results is not None: - preds = cached_results - else: - preds = self.vision_model.predict(image, verbose=0)[0] - self.cache_results(cache_path, preds) - filtered_results = {} - filter_taxon = None if filter_taxon_id is None else self.taxonomy.taxa[filter_taxon_id] - sum_scores = 0 - # loop through vision results from highest score to lowest - for arg in preds.argsort()[::-1]: - score = preds[arg] - # if there are already enough results, or scores are getting too low, then stop - # NOTE: This may not be helpful and needs testing for accuracy and processing time - if self.results_fully_populated(filtered_results, score): - break - taxon_id = self.taxonomy.leaf_class_to_taxon[arg] - # ignore this result if the taxon is outside the requested filter taxon - if filter_taxon is not None: - result_taxon = self.taxonomy.taxa[taxon_id] - if not result_taxon.is_or_descendant_of(filter_taxon): - continue - filtered_results[taxon_id] = score - sum_scores += score - # normalize 
filtered scores to add to 1 - # NOTE: If there was a filter_taxon, or only a partial results list is returned (which is - # the case now while using results_fully_populated), then this modifies the raw vision - # scores. This ensures the scores of this method always sum to 1, even if the results have - # been filtered - for taxon_id in filtered_results: - filtered_results[taxon_id] = filtered_results[taxon_id] / sum_scores - return filtered_results + # given an image object (usually coming from prepare_image_for_inference), + # calculate vision results for the image + def process_image(self, image): + return self.vision_model(tf.convert_to_tensor(image), training=False)[0] diff --git a/lib/vision_testing.py b/lib/vision_testing.py index 57653a4..b45bb4d 100644 --- a/lib/vision_testing.py +++ b/lib/vision_testing.py @@ -6,10 +6,10 @@ import time import json import tensorflow as tf +from statistics import mean from PIL import Image from lib.test_observation import TestObservation from lib.inat_inferrer import InatInferrer -from lib.model_scoring import ModelScoring class VisionTesting: @@ -18,8 +18,10 @@ def __init__(self, config, **args): self.cmd_args = args self.inferrers = {} self.scores = {} - for score_type in ["matching_indices", "top1_distance_scores", - "top5_distance_scores", "top10_distance_scores"]: + score_types = ["matching_indices", "top1_distance_scores", + "top5_distance_scores", "top10_distance_scores", + "sum_ancestor_distance_scores", "average_ancestor_distance_scores"] + for score_type in score_types: self.scores[score_type] = { "vision": {}, "combined": {} @@ -29,8 +31,7 @@ def __init__(self, config, **args): print(json.dumps(model_config, indent=4)) model_name = model_config["name"] if "name" in model_config else f'Model {index}' model_config["name"] = model_name - for score_type in ["matching_indices", "top1_distance_scores", - "top5_distance_scores", "top10_distance_scores"]: + for score_type in score_types: self.scores[score_type]["vision"][index] = [] self.scores[score_type]["combined"][index] = [] @@ -52,11 +53,11 @@ def run(self): # if only one target observation was requested, test this row if it # matches the request, otherwise skip it if int(observation.observation_id) == target_observation_id: - self.test_observation(observation) - return + inferrer_results = self.test_observation(observation) else: continue - inferrer_results = self.test_observation(observation) + else: + inferrer_results = self.test_observation(observation) if inferrer_results is False: # there was some problem processing this test observation. 
Continue but # don't increment the counter so the requested number of observations @@ -87,15 +88,8 @@ def top_x_percent(self, x, scores): top_x = self.top_x(x, scores) return round((top_x / count) * 100, 2) - # sort results by score (x[1]) descending and return the first x results - def top_x_results(self, results, x): - top_x = dict(sorted( - results.items(), key=lambda x: x[1], reverse=True)[:x]) - return top_x - def print_scores(self): for index, inferrer in self.inferrers.items(): - print(inferrer.config["name"]) all_metrics = {} for method in ["vision", "combined"]: scores = self.scores["matching_indices"][method][index] @@ -118,11 +112,16 @@ def print_scores(self): (sum(top5_distance_scores) / metrics["count"]) * 100, 2) metrics["top10∆"] = round( (sum(top10_distance_scores) / metrics["count"]) * 100, 2) + metrics["avg∆"] = round( + (mean(self.scores["average_ancestor_distance_scores"][method][index]) / metrics["count"]) * 100, 2) + metrics["sum∆"] = round( + (mean(self.scores["sum_ancestor_distance_scores"][method][index]) / metrics["count"]) * 100, 2) all_metrics[method] = metrics print("method " + "\t" + "\t".join(all_metrics["vision"].keys())) for method in ["vision", "combined"]: - print(f"{method.ljust(10)}\t" + "\t".join( + stat_label = inferrer.config["name"] + "-" + self.cmd_args["label"] + "-" + method + print(f"{stat_label.ljust(10)}\t" + "\t".join( str(value) for value in all_metrics[method].values())) print("\n") @@ -133,19 +132,22 @@ def prepare_image_for_inference(self, cache_path): image = tf.image.decode_jpeg(image, channels=3) image = tf.image.convert_image_dtype(image, tf.float32) image = tf.image.central_crop(image, 0.875) - image = tf.image.resize(image, [299, 299]) + image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) return tf.expand_dims(image, 0) def assess_top_results(self, observation, top_results): match_index = None distance_scores = [] - for index, key in enumerate(top_results): - if int(key) == int(observation.taxon_id): + for index, row in top_results.reset_index(drop=True).iterrows(): + if row["taxon_id"] == int(observation.taxon_id): match_index = index - # the taxa match, so the taxon distance score is 1 - distance_scores.append(1) - break - elif index < 10: + + if index < 10: + if row["taxon_id"] == int(observation.taxon_id): + # the taxa match, so the taxon distance score is 1 + distance_scores.append(1) + break + # if this is a top 10 result but not a match, append to taxon_scores # some measure of how far away this taxon is from the expected correct taxon using # (1 - [index of match in reversed target ancestry]/[lenth of target ancestry]) @@ -155,7 +157,7 @@ def assess_top_results(self, observation, top_results): # So the taxon score will be (1 - (3/8))^2, or (.625)^2, or 0.3090625 # NOTE: This is experimental and needs testing try: - taxon_match_index = observation.taxon_ancestry[::-1].index(key) + taxon_match_index = observation.taxon_ancestry[::-1].index(row["taxon_id"]) except ValueError: taxon_match_index = None if taxon_match_index: @@ -170,68 +172,105 @@ def test_observation(self, observation): cache_path = self.download_photo(observation.photo_url) if cache_path is None or not os.path.exists(cache_path): return False - try: - image = self.prepare_image_for_inference(cache_path) - except Exception as e: - print(f'\nObservation: {observation.observation_id} has a bad image') + if observation.lat == '' or observation.lng == '': return False - if observation.iconic_taxon_id == "" or 
self.cmd_args["filter_iconic"] is False: - iconic_taxon_id = None - else: + iconic_taxon_id = None + if observation.iconic_taxon_id != "" and self.cmd_args["filter_iconic"] is not False: iconic_taxon_id = int(observation.iconic_taxon_id) - # calculate vision scores + + inferrer_scores = {} for index, inferrer in self.inferrers.items(): - inferrer_scores[index] = {} - inferrer_scores[index]["vision"] = inferrer.vision_inferrer.process_image( - image, iconic_taxon_id, None) - - # calculate geo scores - if inferrer.geo_model and self.cmd_args["geo"]: - geo_scores = inferrer.geo_model.predict( - observation.lat, observation.lng, iconic_taxon_id) - inferrer_scores[index]["combined"] = ModelScoring.combine_vision_and_geo_scores( - inferrer_scores[index]["vision"], geo_scores) - else: - inferrer_scores[index]["combined"] = {} + lat = None + lng = None + filter_taxon = inferrer.lookup_taxon(iconic_taxon_id) + if inferrer.geo_elevation_model and self.cmd_args["geo"]: + lat = observation.lat + lng = observation.lng + try: + inferrer_scores[index] = inferrer.predictions_for_image( + cache_path, lat, lng, filter_taxon + ) + except Exception as e: + print(e) + print(f'\nError scoring observation {observation.observation_id}') + return False return inferrer_scores + def ancestor_distance_scores(self, observation, inferrer, results): + reversed_target_ancestors = observation.taxon_ancestry[::-1] + ancestor_distance_scores = [] + # for each top result + for index, row in results.iterrows(): + result_ancestors = inferrer.taxonomy.df.query( + f'left <= {row["left"]} and right >= {row["right"]}' + ).sort_values("left", ascending=False).reset_index(drop=True) + result_ancestor_match_index = None + # find the most specific taxon in the result's taxon's ancestry that is also in + # the target taxon's ancestry + for ancestor_index, ancestor_row in result_ancestors.iterrows(): + if ancestor_row["taxon_id"] in reversed_target_ancestors: + result_ancestor_match_index = ancestor_index + break + if result_ancestor_match_index is None: + result_ancestor_match_index = len(reversed_target_ancestors) + # calculate a score of how far from species the result matched the target + ancestor_distance_scores.append((1 - (result_ancestor_match_index / len(reversed_target_ancestors)))**2) + return ancestor_distance_scores + def append_to_aggregate_results(self, observation, inferrer_scores): vision_indices = set() + combined_indices = set() for index, results in inferrer_scores.items(): # only look at the top 100 results for this testing - top100_vision = self.top_x_results(results["vision"], 100) - top100_combined = self.top_x_results(results["combined"], 100) + + top100_vision = results.sort_values("vision_score", ascending=False).head(100) + top100_combined = results.sort_values("combined_score", ascending=False).head(100) + vision_index, vision_taxon_distance_scores = self.assess_top_results( observation, top100_vision) combined_index, combined_taxon_distance_scores = self.assess_top_results( observation, top100_combined) + + vision_ancestor_distance_scores = self.ancestor_distance_scores( + observation, self.inferrers[index], top100_vision.head(10)) + combined_ancestor_distance_scores = self.ancestor_distance_scores( + observation, self.inferrers[index], top100_combined.head(10)) + + self.scores["sum_ancestor_distance_scores"]["vision"][index].append( + sum(vision_ancestor_distance_scores)) + self.scores["average_ancestor_distance_scores"]["vision"][index].append( + mean(vision_ancestor_distance_scores)) + 
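+            # illustrative numbers (assumed, not from a real run): each ancestor distance score above
+            # is (1 - result_ancestor_match_index / len(reversed_target_ancestors))**2, so a same-genus
+            # result against a 7-taxon target ancestry contributes (1 - 1/7)**2 ≈ 0.73 and an exact
+            # match contributes 1.0; the sum and mean cover at most the top 10 results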
self.scores["sum_ancestor_distance_scores"]["combined"][index].append( + sum(combined_ancestor_distance_scores)) + self.scores["average_ancestor_distance_scores"]["combined"][index].append( + mean(combined_ancestor_distance_scores)) + vision_indices.add(vision_index) + combined_indices.add(combined_index) self.scores["matching_indices"]["vision"][index].append(vision_index) self.scores["matching_indices"]["combined"][index].append(combined_index) # top1 distance score is just the taxon_distance_score if the first result self.scores["top1_distance_scores"]["vision"][index].append( vision_taxon_distance_scores[0]) - self.scores["top1_distance_scores"]["combined"][index].append( - combined_taxon_distance_scores[0]) + if len(combined_taxon_distance_scores) > 0: + self.scores["top1_distance_scores"]["combined"][index].append( + combined_taxon_distance_scores[0]) # for taxon_distance, top n is the max score of the top n results, or the # taxon_distance_score of the most closely related taxon in the first n results self.scores["top5_distance_scores"]["vision"][index].append( max(vision_taxon_distance_scores[0:5])) - self.scores["top5_distance_scores"]["combined"][index].append( - max(combined_taxon_distance_scores[0:5])) - self.scores["top10_distance_scores"]["vision"][index].append( - max(vision_taxon_distance_scores[0:10])) - self.scores["top10_distance_scores"]["combined"][index].append( - max(combined_taxon_distance_scores[0:10])) - # if len(vision_indices) > 1: - # print(vision_indices) - # print(f'\nResults of Observation: {observation.observation_id}') - # for index, results in inferrer_results.items(): - # print(self.inferrers[index].config["name"]) - # results.print() - # print("===================================================\n") + if len(combined_taxon_distance_scores) > 0: + self.scores["top5_distance_scores"]["combined"][index].append( + max(combined_taxon_distance_scores[0:5])) + self.scores["top10_distance_scores"]["vision"][index].append( + max(vision_taxon_distance_scores[0:10])) + self.scores["top10_distance_scores"]["combined"][index].append( + max(combined_taxon_distance_scores[0:10])) + + # if len(combined_indices) > 1: + # print(f'Results of Observation: {observation.observation_id}: {combined_indices}') def download_photo(self, photo_url): checksum = hashlib.md5(photo_url.encode()).hexdigest() diff --git a/lib/web_forms.py b/lib/web_forms.py index 5bd09bf..153ba02 100644 --- a/lib/web_forms.py +++ b/lib/web_forms.py @@ -10,5 +10,8 @@ class ImageForm(FlaskForm): ]) lat = StringField("lat") lng = StringField("lng") + taxon_id = StringField("taxon_id") geomodel = StringField("geomodel") + aggregated = StringField("aggregated") + score_without_geo = StringField("score_without_geo") format = StringField("format") diff --git a/requirements.txt b/requirements.txt index 265debd..609548f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,6 @@ itsdangerous werkzeug WTForms flake8 -tensorflow +h3 +h3pandas +prison diff --git a/templates/home.html b/templates/home.html deleted file mode 100644 index 0d9f682..0000000 --- a/templates/home.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - Classify Image - - -
- - -
-
-
diff --git a/test_model.py b/test_model.py
index 31d915f..4a555dc 100644
--- a/test_model.py
+++ b/test_model.py
@@ -7,6 +7,7 @@
 @click.command()
 @click.option("--path", required=True, type=click.Path(), help="Path to test data CSV.")
+@click.option("--label", required=True, type=str, help="Label used for output.")
 @click.option("--limit", type=int, show_default=True, default=100,
               help="Max number of observations to test.")
 @click.option("--geo/--no-geo", show_default=True, default=True,