diff --git a/export_data.py b/export_data.py
new file mode 100644
index 0000000..96559b1
--- /dev/null
+++ b/export_data.py
@@ -0,0 +1,30 @@
+import click
+import json
+
+
+@click.command()
+@click.option("--exclude-train-photos-path", "-x", multiple=True,
+              help="Exclude photos that were included in these training sets.")
+@click.option("--limit", type=int, show_default=True, default=5000,
+              help="Number of observations to include.")
+@click.option("--standard_set", help="Export the standard set of 18 filtered datasets.")
+@click.option("--filename_suffix", type=str, help="String to add to end of filename.")
+@click.option("--place_id", type=int, help="Export observations in this place.")
+@click.option("--taxon_id", type=int, help="Export observations in this taxon.")
+def test(**args):
+    # some libraries are slow to import, so wait until the command is validated and properly invoked
+    from lib.model_test_data_exporter import ModelTestDataExporter
+    print("\nArguments:")
+    print(json.dumps(args, indent=4))
+    print("\nInitializing ModelTestDataExporter...\n")
+    model_test_data_exporter = ModelTestDataExporter(**args)
+    print("Exporting data...\n")
+    if "standard_set" in args and args["standard_set"]:
+        model_test_data_exporter.generate_standard_set()
+    else:
+        model_test_data_exporter.generate_from_cmd_args()
+    print("\nDone\n")
+
+
+if __name__ == "__main__":
+    test()
diff --git a/generate_thresholds.py b/generate_thresholds.py
index 2d3bad4..3f0bd14 100644
--- a/generate_thresholds.py
+++ b/generate_thresholds.py
@@ -3,229 +3,347 @@
 """
 
 import argparse
-from tqdm.auto import tqdm
+import tifffile
+import os
 import pandas as pd
-import h3pandas
 import numpy as np
+import h3
+import h3pandas
+import tensorflow as tf
+import csv
 import math
+import json
+from tqdm.auto import tqdm
 from sklearn.metrics import precision_recall_curve
-from sklearn.metrics import auc
-import sys
+import matplotlib.pyplot as plt
+import warnings
 
-def ratios_for_taxon_id(taxon_id, all_spatial_grid_counts, train_df_h3):
-
-    #counts in each presence cell for target taxon
-    target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts()
+class ResLayer(tf.keras.layers.Layer):
+    def __init__(self):
+        super(ResLayer, self).__init__()
+        self.w1 = tf.keras.layers.Dense(
+            256, activation="relu", kernel_initializer="he_normal"
+        )
+        self.w2 = tf.keras.layers.Dense(
+            256, activation="relu", kernel_initializer="he_normal"
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=0.5)
+        self.add = tf.keras.layers.Add()
+
+    def call(self, inputs):
+        x = self.w1(inputs)
+        x = self.dropout(x)
+        x = self.w2(x)
+        x = self.add([x, inputs])
+        return x
+
+    def get_config(self):
+        return {}
+
+class Taxon:
+
+    def __init__(self, row):
+        for key in row:
+            setattr(self, key, row[key])
+
+    def set(self, attr, val):
+        setattr(self, attr, val)
+
+    def is_or_descendant_of(self, taxon):
+        if self.id == taxon.id:
+            return True
+        return self.descendant_of(taxon)
+
+    # using the nested set left and right values, a taxon is a descendant of another
+    # as long as its left is higher and its right is lower
+    def descendant_of(self, taxon):
+        return self.left > taxon.left and self.right < taxon.right
+
+class ModelTaxonomy:
+
+    def __init__(self, path):
+        self.load_mapping(path)
+        self.assign_nested_values()
+
+    def load_mapping(self, path):
+        self.node_key_to_leaf_class_id = {}
+        self.leaf_class_to_taxon = {}
+        # there is no taxon with ID 0, but roots of the taxonomy have a parent ID of 0,
+        # so create a fake taxon of Life to represent the root of the entire tree
+        self.taxa = {0: Taxon({"name": "Life", "depth": 0})}
+        self.taxon_children = {}
+        try:
+            with open(path) as csv_file:
+                csv_reader = csv.DictReader(csv_file, delimiter=",")
+                for row in csv_reader:
+                    taxon_id = int(row["taxon_id"])
+                    rank_level = float(row["rank_level"])
+                    leaf_class_id = int(row["leaf_class_id"]) if row["leaf_class_id"] else None
+                    parent_id = int(row["parent_taxon_id"]) if row["parent_taxon_id"] else 0
+                    # some taxa are not leaves and aren't represented in the leaf layer
+                    if leaf_class_id is not None:
+                        self.node_key_to_leaf_class_id[taxon_id] = leaf_class_id
+                        self.leaf_class_to_taxon[leaf_class_id] = taxon_id
+                    self.taxa[taxon_id] = Taxon({
+                        "id": taxon_id,
+                        "name": row["name"],
+                        "parent_id": parent_id,
+                        "leaf_class_id": leaf_class_id,
+                        "rank_level": rank_level
+                    })
+                    if parent_id not in self.taxon_children:
+                        self.taxon_children[parent_id] = []
+                    self.taxon_children[parent_id].append(taxon_id)
+        except IOError as e:
+            print(e)
+            print(f"\n\nCannot open mapping file `{path}`\n\n")
+            raise e
+
+    # prints to the console a representation of this tree
+    def print(self, taxon_id=0, ancestor_prefix=""):
+        children = self.taxon_children[taxon_id]
+        index = 0
+        for child_id in children:
+            last_in_branch = (index == len(children) - 1)
+            index += 1
+            icon = "└──" if last_in_branch else "├──"
+            prefixIcon = "   " if last_in_branch else "│   "
+            taxon = self.taxa[child_id]
+            print(f'{ancestor_prefix}{icon}{taxon.name} :: {taxon.left}:{taxon.right}')
+            if child_id in self.taxon_children:
+                self.print(child_id, f"{ancestor_prefix}{prefixIcon}")
+
+    # calculates nested set left and right values, plus a depth recording how many nodes
+    # down the taxon is from Life. These can later be used as an efficient way to calculate
+    # whether a taxon is a descendant of another
+    def assign_nested_values(self, taxon_id=0, index=0, depth=1, ancestors=[]):
+        for child_id in self.taxon_children[taxon_id]:
+            self.taxa[child_id].set("left", index)
+            self.taxa[child_id].set("depth", depth)
+            self.taxa[child_id].set("ancestors", ancestors)
+            index += 1
+            if child_id in self.taxon_children:
+                child_ancestors = ancestors + [child_id]
+                index = self.assign_nested_values(child_id, index, depth + 1, child_ancestors)
+            self.taxa[child_id].set("right", index)
+            index += 1
+        return index
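The nested-set bookkeeping above is what every descendant lookup in this PR relies on, so a tiny worked example may help reviewers. The IDs below are hypothetical, not taxa from a real mapping file; this is a sketch, not part of the module:

```python
# Hypothetical three-taxon tree: Life -> Animalia -> Aves.
# assign_nested_values walks the tree depth-first, so Animalia
# gets left=0, Aves gets left=1/right=2, then Animalia gets right=3.
animalia = Taxon({"id": 1, "left": 0, "right": 3})
aves = Taxon({"id": 3, "left": 1, "right": 2})

# a taxon is a descendant when its interval nests inside the other's
assert aves.descendant_of(animalia)        # 1 > 0 and 2 < 3
assert not animalia.descendant_of(aves)    # 0 > 1 is False
assert aves.is_or_descendant_of(aves)      # equal ids count as "is"
```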
+
+
+class TFGeoPriorModelEnv:
+
+    def __init__(self, model, taxonomy):
+        self.taxonomy = taxonomy
+        # initialize the geo model for inference
+        self.gpmodel = tf.keras.models.load_model(
+            model,
+            custom_objects={'ResLayer': ResLayer},
+            compile=False
+        )
-
-    #background count per target taxon count in target taxon presence cells
-    counts = pd.DataFrame({
-        "target": target_spatial_grid_counts,
-        "all": all_spatial_grid_counts,
-    })
-    counts = counts.fillna(0)
-    counts["target_ratio"] = counts["all"]/counts["target"]
-    counts.replace([np.inf, -np.inf], np.nan, inplace=True)
-    good_counts = counts["target_ratio"].dropna()
-    return good_counts
-
-def pseudo_absences_for_taxon_id(
-    taxon_id,
-    all_spatial_grid_counts,
-    train_df_h3,
-    test_df_h3,
-    full_spatial_data_lookup_table
-    ):
-
-    #count cutoff is mean background count per target taxon count in target taxon presence cells
-    count_cutoff = ratios_for_taxon_id(taxon_id, all_spatial_grid_counts, train_df_h3).mean()
-    cutoff_grid_cell_indices = set(all_spatial_grid_counts[all_spatial_grid_counts>count_cutoff].index)
-    #absence candidates from test
-    ilocs = np.unique(np.random.randint(0, len(test_df_h3), 10_000))
-    sample_occupancy_pseudoabsences = []
-    for i in ilocs:
-        row = test_df_h3.iloc[i]
-
-        if row.name in cutoff_grid_cell_indices:
-            if taxon_id in full_spatial_data_lookup_table.loc[row.name].taxon_id:
-                #candidate in cell containing target taxon so can't be absence
-                pass
-            else:
-                #candidate in cell not containing target taxon and
-                #count is greater than count cutoff. Use as absence
-                sample_occupancy_pseudoabsences.append(
-                    (row.lat, row.lng)
-                )
+    def features_for_one_class_elevation(self, latitude, longitude, elevation):
+        """Evaluates the feature-extraction layers of the model for multiple locations
+
+        Args:
+            latitude (list): A list of latitudes
+            longitude (list): A list of longitudes (same length as latitude)
+            elevation (list): A list of elevations (same length as latitude)
+
+        Returns:
+            numpy array: the feature vector for each location
+        """
+        def encode_loc(latitude, longitude, elevation):
+            latitude = np.array(latitude)
+            longitude = np.array(longitude)
+            elevation = np.array(elevation)
+            elevation = elevation.astype("float32")
+            grid_lon = longitude.astype('float32') / 180.0
+            grid_lat = latitude.astype('float32') / 90.0
+
+            elevation[elevation>0] = elevation[elevation>0]/6574.0
+            elevation[elevation<0] = elevation[elevation<0]/32768.0
+            norm_elev = elevation
+
+            if np.isscalar(grid_lon):
+                grid_lon = np.array([grid_lon])
+            if np.isscalar(grid_lat):
+                grid_lat = np.array([grid_lat])
+            if np.isscalar(norm_elev):
+                norm_elev = np.array([norm_elev])
+
+            norm_loc = tf.stack([grid_lon, grid_lat], axis=1)
+
+            encoded_loc = tf.concat([
+                tf.sin(norm_loc * math.pi),
+                tf.cos(norm_loc * math.pi),
+                tf.expand_dims(norm_elev, axis=1),
+            ], axis=1)
+
+            return encoded_loc
+
+        encoded_loc = encode_loc(latitude, longitude, elevation)
+        loc_emb = self.gpmodel.layers[0](encoded_loc)
+
+        # res layers - feature extraction
+        x = self.gpmodel.layers[1](loc_emb)
+        x = self.gpmodel.layers[2](x)
+        x = self.gpmodel.layers[3](x)
+        x = self.gpmodel.layers[4](x)
-
-        #limit number of absences
-        if len(sample_occupancy_pseudoabsences) >= 500:
-            break
+
+        # return the location features; per-class scoring happens separately
+        return x
+
+    def eval_one_class_elevation_from_features(self, x, class_of_interest):
+        return tf.keras.activations.sigmoid(
+            tf.matmul(
+                x,
+                tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0),
+                transpose_b=True
+            )
+        ).numpy()
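Because the residual feature extraction is class-independent and the class head is just one column of the final Dense layer, the intended pattern is to run the expensive pass once over all locations and then score any number of taxa with a cheap matrix product. A minimal usage sketch, assuming a trained model and mapping CSV exist at the placeholder paths below:

```python
# sketch only: "taxonomy.csv" and "geo_model.h5" are placeholder paths
mt = ModelTaxonomy("taxonomy.csv")
tfgpm = TFGeoPriorModelEnv("geo_model.h5", mt)

# expensive part, done once for all cells of interest
feats = tfgpm.features_for_one_class_elevation(
    latitude=[52.5, -33.9], longitude=[13.4, 18.4], elevation=[34.0, 25.0])

# cheap part, repeated per taxon
for leaf_class_id in (0, 1, 2):  # placeholder leaf class IDs
    scores = tfgpm.eval_one_class_elevation_from_features(feats, leaf_class_id)
    print(leaf_class_id, scores.squeeze())
```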
+
+# capture the installed warning handler so the filter below can delegate to it
+_original_showwarning = warnings.showwarning
+
+def ignore_shapely_deprecation_warning(message, category, filename, lineno, file=None, line=None):
+    if "array interface is deprecated" in str(message):
+        return None
+    return _original_showwarning(message, category, filename, lineno, file, line)
+
+def main(args):
+    print("loading in the model...")
+    mt = ModelTaxonomy(args.taxonomy)
+    tfgpm = TFGeoPriorModelEnv(args.model, mt)
-    sample_occupancy_pseudoabsences = pd.DataFrame(
-        sample_occupancy_pseudoabsences,
-        columns=["lat", "lng"]
+    print("setting up the map...")
+    warnings.showwarning = ignore_shapely_deprecation_warning
+    im = tifffile.imread(args.elevation)
+    im_df = pd.DataFrame(im)
+    im_df.index = np.linspace(90, -90, 2160)
+    im_df.columns = np.linspace(-180, 180, 4320)
+    im_df = im_df.reset_index()
+    im_df = im_df.melt(
+        id_vars=["index"],
     )
-    sample_occupancy_pseudoabsences["taxon_id"] = taxon_id
-    return sample_occupancy_pseudoabsences
-
-def get_threshold(test, predictions):
-
-    precision, recall, thresholds = precision_recall_curve(test, predictions)
-    p1 = (2 * precision * recall)
-    p2 = (precision + recall)
-    out = np.zeros( (len(p1)) )
-    fscore = np.divide(p1,p2, out=out, where=p2!=0)
-    index = np.argmax(fscore)
-    thresholdOpt = round(thresholds[index], ndigits = 4)
-    prauc = auc(recall, precision)
-    return thresholdOpt, prauc
-
-def get_predictions(latitude,longitude,taxon_id, mt, tfgpm):
-
-    class_of_interest = mt.node_key_to_leaf_class_id[taxon_id]
-    predictions = tfgpm.eval_one_class(
-        np.array(latitude),
-        np.array(longitude),
-        class_of_interest
+    im_df.columns = ["lat", "lng", "elevation"]
+    elev_dfh3 = im_df.h3.geo_to_h3(args.h3_resolution)
+    elev_dfh3 = elev_dfh3.drop(
+        columns=['lng', 'lat']
+    ).groupby("h3_0"+str(args.h3_resolution)).mean()
+    gdfk = elev_dfh3.h3.h3_to_geo()
+    gdfk["lng"] = gdfk["geometry"].x
+    gdfk["lat"] = gdfk["geometry"].y
+    _ = gdfk.pop("geometry")
+    gdfk = gdfk.rename_axis('h3index')
+
+    print("making features...")
+    feats = tfgpm.features_for_one_class_elevation(
+        latitude=list(gdfk.lat),
+        longitude=list(gdfk.lng),
+        elevation=list(gdfk.elevation)
     )
-    return predictions
-
-def get_predictions_py(latitude,longitude,taxon_id, net_params, model, torch):
-
-    class_of_interest = net_params["params"]["class_to_taxa"].index(taxon_id)
-    grid_lon = torch.from_numpy(longitude/180)
-    grid_lat = torch.from_numpy(latitude/90)
-    grid_lon = grid_lon.repeat(1,1).unsqueeze(2)
-    grid_lat = grid_lat.repeat(1, 1).unsqueeze(2)
-    loc_ip = torch.cat((grid_lon, grid_lat), 2)
-    feats = torch.cat((torch.sin(math.pi*loc_ip), torch.cos(math.pi*loc_ip)), 2)
-    model.eval()
-    with torch.no_grad():
-        pred = model(feats, class_of_interest=class_of_interest)
-    predictions = pred.cpu().numpy()[0]
-    return predictions
-
-def main(args):
-
-    print("loading in test and train data...")
-    test_df = pd.read_csv(args.test_spatial_data_path,
-                          usecols=["taxon_id","latitude","longitude"]).rename({
-        "latitude": "lat",
-        "longitude": "lng"
-    }, axis=1)
-    train_df = pd.read_csv(args.train_spatial_data_path,
-                           usecols=["taxon_id","latitude","longitude"]).rename({
+    print("loading in the training data...")
+    train_df = pd.read_csv(args.train_spatial_data,
+                           usecols=["taxon_id","latitude","longitude","captive"]).rename({
         "latitude": "lat",
         "longitude": "lng"
     }, axis=1)
-    taxon_ids = test_df.taxon_id.unique()
-    if args.stop_after is not None:
-        taxon_ids = taxon_ids[0:args.stop_after]
-
-    print("calculating absences...")
-    test_df_h3 = test_df.h3.geo_to_h3(args.h3_resolution)
+    train_df = train_df[train_df.captive==0]  # no-CID ok, wild only
+    train_df = train_df.drop(["captive"], axis=1)
     train_df_h3 = train_df.h3.geo_to_h3(args.h3_resolution)
-    full_spatial_data_lookup_table = pd.concat([train_df_h3, test_df_h3]).pivot_table(
-        index="h3_0"+str(args.h3_resolution),
-        values="taxon_id",
-        aggfunc=set
-    )
-    pseudoabsence_df = pd.DataFrame()
     all_spatial_grid_counts = train_df_h3.index.value_counts()
-    for taxon_id in tqdm(taxon_ids):
-        pa = pseudo_absences_for_taxon_id(
-            taxon_id,
-            all_spatial_grid_counts,
-            train_df_h3,
-            test_df_h3,
-            full_spatial_data_lookup_table
-        )
-        pseudoabsence_df = pd.concat([pseudoabsence_df, pa])
-
-    print("calculating thresholds...")
-    if args.model_type == "tf":
-        from lib.taxon import Taxon
-        from lib.model_taxonomy import ModelTaxonomy
-        from lib.tf_gp_model import TFGeoPriorModel
-        mt = ModelTaxonomy(args.taxonomy_path)
-        tfgpm = TFGeoPriorModel(args.model_path, mt)
-    elif args.model_type == "pytorch":
-        import torch
-        sys.path.append("../geo_prior_inat")
-        from geo_prior import models
-        net_params = torch.load(args.model_path, map_location="cpu")
-        net_params["params"]['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        model_name = models.select_model(net_params["params"]["model"])
-        model = model_name(num_inputs=net_params["params"]['num_feats'],
-                           num_classes=net_params["params"]['num_classes'],
-                           num_filts=net_params["params"]['num_filts'],
-                           num_users=net_params["params"]['num_users'],
-                           num_context=net_params["params"]['num_context']).to(net_params["params"]['device'])
-        model.load_state_dict(net_params['state_dict'])
+    presence_absence = pd.DataFrame({
+        "background": all_spatial_grid_counts,
+    })
+    presence_absence = presence_absence.fillna(0)
+
+    print("...looping through taxa")
     output = []
+    taxa = pd.read_csv(args.taxonomy, usecols=["taxon_id","leaf_class_id","iconic_class_id"]).dropna(subset=['leaf_class_id'])
+    taxon_ids = taxa.taxon_id
+    if args.stop_after is not None:
+        taxon_ids = taxon_ids[0:args.stop_after]
+    desired_recall = 0.95
+    resolution = args.h3_resolution
+    area = h3.hex_area(resolution)
     for taxon_id in tqdm(taxon_ids):
-        absences = pseudoabsence_df[pseudoabsence_df.taxon_id==taxon_id]
-        presences = test_df[test_df.taxon_id==taxon_id]
-        test = np.concatenate(([1] * len(presences), [0] * len(absences)))
-
-        longitude = np.concatenate((
-            presences.lng.values.astype('float32'),
-            absences.lng.values.astype('float32')
-        ))
-
-        latitude = np.concatenate((
-            presences.lat.values.astype('float32'),
-            absences.lat.values.astype('float32')
-        ))
-
-        if args.model_type == "tf":
-            predictions = get_predictions(latitude, longitude, taxon_id, mt, tfgpm)
-        elif args.model_type == "pytorch":
-            predictions = get_predictions_py(latitude, longitude, taxon_id, net_params, model, torch)
-        threshold, prauc = get_threshold(test, predictions)
-
+        try:
+            class_of_interest = mt.node_key_to_leaf_class_id[taxon_id]
+        except KeyError:
+            print(f'taxon {taxon_id} is not in the model')
+            continue
+
+        # get predictions
+        preds = tfgpm.eval_one_class_elevation_from_features(feats, class_of_interest)
+        gdfk["pred"] = tf.squeeze(preds).numpy()
+
+        # make presence absence dataset
+        target_spatial_grid_counts = train_df_h3[train_df_h3.taxon_id==taxon_id].index.value_counts()
+        presences = gdfk.loc[target_spatial_grid_counts.index]["pred"]
+        if len(presences) == 0:
+            print("not present")
+            continue
+
+        # calculate threshold
+        presence_absence["foreground"] = target_spatial_grid_counts
+        presence_absence["predictions"] = gdfk["pred"]
+        presence_absence.foreground = presence_absence.foreground.fillna(0)
+        yield_cutoff = np.percentile((presence_absence["background"]/presence_absence["foreground"])[presence_absence["foreground"]>0], 95)
+        absences = presence_absence[(presence_absence["foreground"]==0) & (presence_absence["background"] > yield_cutoff)]["predictions"]
+        presences = presence_absence[(presence_absence["foreground"]>0)]["predictions"]
+        df_x = pd.DataFrame({'predictions': presences, 'test': 1})
+        df_y = pd.DataFrame({'predictions': absences, 'test': 0})
+        for_thres = pd.concat([df_x, df_y], ignore_index=False)
+        precision, recall, thresholds = precision_recall_curve(for_thres.test, for_thres.predictions)
+        p1 = (2 * precision * recall)
+        p2 = (precision + recall)
+        out = np.zeros( (len(p1)) )
+        fscore = np.divide(p1,p2, out=out, where=p2!=0)
+        index = np.argmax(fscore)
+        thres = thresholds[index]
+
+        # store data
         row = {
             "taxon_id": taxon_id,
-            "threshold": threshold,
-            "auc": prauc,
-            "num_presences": len(presences),
-            "num_absences": len(absences)
+            "thres": thres,
+            "area": len(gdfk[gdfk.pred >= thres])*area
        }
         row_dict = dict(row)
         output.append(row_dict)
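The loop above picks each taxon's threshold as the point on the precision-recall curve that maximizes F1, guarding against division by zero where precision + recall is 0. The same selection on synthetic inputs (invented scores, not real model output):

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# synthetic example: 1 = presence cell, 0 = pseudo-absence cell
test = np.array([1, 1, 1, 0, 0, 0, 1, 0])
predictions = np.array([0.9, 0.8, 0.7, 0.6, 0.2, 0.1, 0.4, 0.3])

precision, recall, thresholds = precision_recall_curve(test, predictions)
f1 = np.divide(2 * precision * recall, precision + recall,
               out=np.zeros(len(precision)), where=(precision + recall) != 0)
# thresholds has one fewer entry than precision/recall, but the final
# precision/recall point has F1 = 0, so argmax stays in range
best = thresholds[np.argmax(f1)]
print(f"F1-maximizing threshold: {best}")
```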
-
+
     print("writing output...")
-    pd.DataFrame(output).to_csv(args.output_path)
+    output_pd = pd.DataFrame(output)
+    output_pd.to_csv(args.output_dir+"/thresholds.csv")
 
 
 if __name__ == "__main__":
     info_str = '\nrun as follows\n' + \
-               '   python generate_thresholds.py --model_type tf \n' + \
-               '   --model_path tf_geoprior_1_4_r6.h5 \n' + \
-               '   --taxonomy_path taxonomy_1_4.csv\n' + \
-               '   --test_spatial_data_path test_spatial_data.csv\n' + \
-               '   --train_spatial_data_path train_spatial_data.csv\n' + \
-               '   --output_path thresholds.csv\n' + \
-               '   --h3_resolution 3\n' + \
+               '   python generate_thresholds.py --elevation wc2.1_5m_elev.tif \n' + \
+               '   --model v2_6/tf_geoprior_2_5_r6_elevation.h5 \n' + \
+               '   --taxonomy taxonomy_1_4.csv\n' + \
+               '   --train_spatial_data v2_6/taxonomy.csv\n' + \
+               '   --output_dir v2_6\n' + \
+               '   --h3_resolution 4\n' + \
                '   --stop_after 10\n'
 
     parser = argparse.ArgumentParser(usage=info_str)
-    parser.add_argument('--model_type', type=str,
-                        help='Can be either "tf" or "pytorch".', required=True)
-    parser.add_argument('--model_path', type=str,
+    parser.add_argument('--elevation', type=str,
+                        help='Path to elev tif.', required=True)
+    parser.add_argument('--model', type=str,
                         help='Path to tf model.', required=True)
-    parser.add_argument('--taxonomy_path', type=str,
+    parser.add_argument('--taxonomy', type=str,
                         help='Path to taxonomy csv.', required=True)
-    parser.add_argument('--test_spatial_data_path', type=str,
-                        help='Path to test csv for presence/absences.', required=True)
-    parser.add_argument('--train_spatial_data_path', type=str,
+    parser.add_argument('--train_spatial_data', type=str,
                         help='Path to train csv for occupancy.', required=True)
-    parser.add_argument('--output_path', type=str,
-                        help='Path to write thesholds.', required=True)
-    parser.add_argument('--h3_resolution', type=int, default=3,
-                        help='grid resolution from 0 - 15, lower numbers are coarser/faster. Recommend 3, 4, or 5')
+    parser.add_argument('--output_dir', type=str,
+                        help='directory to write thresholds.', required=True)
+    parser.add_argument('--h3_resolution', type=int, default=4,
+                        help='grid resolution from 0 - 15, lower numbers are coarser/faster. Currently using 4')
     parser.add_argument('--stop_after', type=int,
-                        help='just run the first x taxa')
+                        help='just run the first x taxa')
     args = parser.parse_args()
 
     main(args)
+
\ No newline at end of file
diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py
index 8e155f9..1a6fb67 100644
--- a/lib/inat_inferrer.py
+++ b/lib/inat_inferrer.py
@@ -1,11 +1,22 @@
-import os
+import time
 import magic
 import tensorflow as tf
+import pandas as pd
+import h3
+import h3pandas  # noqa: F401
+import math
+import os
+import tifffile
+import numpy as np
 from PIL import Image
-from lib.pt_geo_prior_model import PTGeoPriorModel
-from lib.tf_gp_model import TFGeoPriorModel
+from lib.tf_gp_elev_model import TFGeoPriorModelElev
 from lib.vision_inferrer import VisionInferrer
-from lib.model_taxonomy import ModelTaxonomy
+from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe
+
+# TODO: is there a better way to address the SettingWithCopyWarning?
+pd.options.mode.chained_assignment = None + +MINIMUM_GEO_SCORE = 0.005 class InatInferrer: @@ -14,35 +25,367 @@ def __init__(self, config): self.config = config self.setup_taxonomy(config) self.setup_vision_model(config) + self.setup_elevation_dataframe(config) self.setup_geo_model(config) self.upload_folder = "static/" def setup_taxonomy(self, config): - self.taxonomy = ModelTaxonomy(config["taxonomy_path"]) + self.taxonomy = ModelTaxonomyDataframe( + config["taxonomy_path"], + config["tf_elev_thresholds"] if "tf_elev_thresholds" in config else None + ) def setup_vision_model(self, config): self.vision_inferrer = VisionInferrer(config["vision_model_path"], self.taxonomy) + def setup_elevation_dataframe(self, config): + self.geo_elevation_cells = None + # load elevation data stored at H3 resolution 4 + if "elevation_h3_r4" in config: + self.geo_elevation_cells = pd.read_csv(config["elevation_h3_r4"]). \ + sort_values("h3_04").set_index("h3_04").sort_index() + self.geo_elevation_cells = InatInferrer.add_lat_lng_to_h3_geo_dataframe(self.geo_elevation_cells) + + def setup_elevation_dataframe_from_worldclim(self, config, resolution): + # preventing from processing at too high a resolution + if resolution > 5: + return + if "wc2.1_5m_elev_2.tif" in config: + im = tifffile.imread(config["wc2.1_5m_elev_2.tif"]) + im_df = pd.DataFrame(im) + im_df.index = np.linspace(90, -90, 2160) + im_df.columns = np.linspace(-180, 180, 4320) + im_df = im_df.reset_index() + im_df = im_df.melt(id_vars=["index"]) + im_df.columns = ["lat", "lng", "elevation"] + elev_dfh3 = im_df.h3.geo_to_h3(resolution) + elev_dfh3 = elev_dfh3.drop(columns=["lng", "lat"]).groupby(f'h3_0{resolution}').mean() + def setup_geo_model(self, config): - if "use_pt_gp_model" in config and config["use_pt_gp_model"] and "pt_geo_model_path" in config: - self.geo_model = PTGeoPriorModel(config["pt_geo_model_path"], self.taxonomy) - elif "tf_geo_model_path" in config: - self.geo_model = TFGeoPriorModel(config["tf_geo_model_path"], self.taxonomy) - else: - self.geo_model = None + self.geo_elevation_model = None + self.geo_model_features = None + if "tf_geo_elevation_model_path" in config and self.geo_elevation_cells is not None: + self.geo_elevation_model = TFGeoPriorModelElev(config["tf_geo_elevation_model_path"]) + self.geo_model_features = self.geo_elevation_model.features_for_one_class_elevation( + latitude=list(self.geo_elevation_cells.lat), + longitude=list(self.geo_elevation_cells.lng), + elevation=list(self.geo_elevation_cells.elevation) + ) - def prepare_image_for_inference(self, file_path, image_uuid): + def prepare_image_for_inference(self, file_path): mime_type = magic.from_file(file_path, mime=True) # attempt to convert non jpegs if mime_type != "image/jpeg": im = Image.open(file_path) - rgb_im = im.convert("RGB") - file_path = os.path.join(self.upload_folder, image_uuid) + ".jpg" - rgb_im.save(file_path) - - image = tf.io.read_file(file_path) - image = tf.image.decode_jpeg(image, channels=3) + image = im.convert("RGB") + else: + image = tf.io.read_file(file_path) + image = tf.image.decode_jpeg(image, channels=3) image = tf.image.convert_image_dtype(image, tf.float32) image = tf.image.central_crop(image, 0.875) image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) return tf.expand_dims(image, 0) + + def vision_predict(self, image, debug=False): + if debug: + start_time = time.time() + vision_scores = self.vision_inferrer.process_image(image) + if debug: + print("Vision Time: %0.2fms" % ((time.time() - 
start_time) * 1000.))
+        return vision_scores
+
+    def geo_model_predict(self, lat, lng, debug=False):
+        if debug:
+            start_time = time.time()
+        if lat is None or lat == "" or lng is None or lng == "":
+            return None
+        if self.geo_elevation_model is None:
+            return None
+        # lookup the H3 cell this lat lng occurs in
+        h3_cell = h3.geo_to_h3(float(lat), float(lng), 4)
+        h3_cell_centroid = h3.h3_to_geo(h3_cell)
+        # get the average elevation of the above H3 cell
+        elevation = self.geo_elevation_cells.loc[h3_cell].elevation
+        geo_scores = self.geo_elevation_model.predict(
+            h3_cell_centroid[0], h3_cell_centroid[1], float(elevation))
+        if debug:
+            print("Geo Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+        return geo_scores
+
+    def lookup_taxon(self, taxon_id):
+        if taxon_id is None:
+            return None
+        try:
+            return self.taxonomy.df.loc[taxon_id]
+        except Exception as e:
+            print(f'taxon `{taxon_id}` does not exist in the taxonomy')
+            raise e
+
+    def predictions_for_image(self, file_path, lat, lng, filter_taxon, score_without_geo=False,
+                              debug=False):
+        if debug:
+            start_time = time.time()
+        image = self.prepare_image_for_inference(file_path)
+        raw_vision_scores = self.vision_predict(image, debug)
+        raw_geo_scores = self.geo_model_predict(lat, lng, debug)
+        top100 = self.combine_results(raw_vision_scores, raw_geo_scores, filter_taxon,
+                                      score_without_geo, debug)
+        if debug:
+            print("Prediction Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+        return top100
+
+    def combine_results(self, raw_vision_scores, raw_geo_scores, filter_taxon,
+                        score_without_geo=False, debug=False):
+        if debug:
+            start_time = time.time()
+        no_geo_scores = (raw_geo_scores is None)
+
+        # make a copy of the model taxonomy leaf nodes to be used for storing results. Skip any
+        # filtering at this stage as the taxonomy dataframe needs to have the same number of
+        # leaf taxa, in the same order, as the raw scores
+        leaf_scores = self.taxonomy.leaf_df.copy()
+        # add a column for vision scores
+        leaf_scores["vision_score"] = raw_vision_scores
+        # add a column for geo scores
+        leaf_scores["geo_score"] = 0 if no_geo_scores else raw_geo_scores
+        # set a lower limit for geo scores if there are any
+        leaf_scores["normalized_geo_score"] = 0 if no_geo_scores \
+            else leaf_scores["geo_score"].clip(MINIMUM_GEO_SCORE, None)
+
+        # if filtering by a taxon, restrict results to that taxon and its descendants
+        if filter_taxon is not None:
+            # using nested set left and right values, select the filter taxon and its descendants
+            leaf_scores = leaf_scores.query(
+                f'left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}'
+            )
+            # normalize the vision scores so they add up to 1 after filtering
+            sum_of_vision_scores = leaf_scores["vision_score"].sum()
+            leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"] / sum_of_vision_scores
+        else:
+            # when not filtering by a taxon, the normalized vision score is the same as the original
+            leaf_scores["normalized_vision_score"] = leaf_scores["vision_score"]
+
+        if no_geo_scores or score_without_geo:
+            # if there are no geo scores, or it was requested to not use geo scores to affect
+            # the final combined score, set the combined scores to be the same as the vision scores
+            leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"]
+        else:
+            # the combined score is simply the normalized vision score
+            # multiplied by the normalized geo score
+            leaf_scores["combined_score"] = leaf_scores["normalized_vision_score"] * \
+                leaf_scores["normalized_geo_score"]
+        if
debug: + print("Score Combining Time: %0.2fms" % ((time.time() - start_time) * 1000.)) + return leaf_scores + + def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False, debug=False): + if debug: + start_time = time.time() + + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + # make a copy of the full taxonomy including non-leaves to be used for storing results + if filter_taxon is not None: + # using nested set left and right values, select the filter taxon, + # its descendants, and its ancestors + all_node_scores = self.taxonomy.df.query( + f'(left >= {filter_taxon["left"]} and right <= {filter_taxon["right"]}) or' + + f'(left < {filter_taxon["left"]} and right > {filter_taxon["right"]})' + ).copy().reset_index(drop=True) + else: + all_node_scores = self.taxonomy.df.copy().reset_index(drop=True) + + # copy the score columns from the already-calculated leaf scores + all_node_scores = pd.merge(all_node_scores, leaf_scores[[ + "taxon_id", "vision_score", "normalized_vision_score", "geo_score", + "normalized_geo_score"]], on="taxon_id", how="left").set_index("taxon_id", drop=False) + + # calculate the highest combined score + top_combined_score = leaf_scores.sort_values( + "combined_score", ascending=False).head(1)["combined_score"].values[0] + lower_cutoff_threshold = 0.0001 + # determine a lower-bound cutoff where results with combined scores below this cutoff + # will be ignored. This isn't necessary for scoring, but it improves performance + # TODO: evaluate this + cutoff = max([0.00001, top_combined_score * lower_cutoff_threshold]) + + aggregated_scores = {} + # restrict score aggregation to results where the combined score is above the cutoff + scores_to_aggregate = leaf_scores.query(f'combined_score > {cutoff}') + # loop through all results where the combined score is above the cutoff + for taxon_id, vision_score, geo_score, geo_threshold in zip( + scores_to_aggregate["taxon_id"], + scores_to_aggregate["normalized_vision_score"], + scores_to_aggregate["normalized_geo_score"], + scores_to_aggregate["geo_threshold"] + ): + # loop through the pre-calculated ancestors of this result's taxon + for ancestor_taxon_id in self.taxonomy.taxon_ancestors[taxon_id]: + # set default values for the ancestor the first time it is referenced + if ancestor_taxon_id not in aggregated_scores: + aggregated_scores[ancestor_taxon_id] = {} + aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] = 0 + if not no_geo_scores: + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = 0 + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 100 + # aggregated vision score is a sum of descendant scores + aggregated_scores[ancestor_taxon_id]["aggregated_vision_score"] += vision_score + if not no_geo_scores and geo_score > aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"]: + # aggregated geo score is the max of descendant geo scores + aggregated_scores[ancestor_taxon_id]["aggregated_geo_score"] = geo_score + if not no_geo_scores and \ + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] != 0 and \ + geo_score > geo_threshold: + # aggregated geo threshold is set to 0 if any descendants are above their threshold + aggregated_scores[ancestor_taxon_id]["aggregated_geo_threshold"] = 0 + + # turn the aggregated_scores dict into a data frame + scores_df = pd.DataFrame.from_dict(aggregated_scores, orient="index") + # merge the aggregated scores into the scoring taxonomy + all_node_scores = all_node_scores.join(scores_df) + + # the aggregated 
scores of leaves will be NaN, so populate them with their original scores
+        all_node_scores["aggregated_vision_score"].fillna(
+            all_node_scores["normalized_vision_score"], inplace=True)
+        if no_geo_scores:
+            all_node_scores["aggregated_geo_score"] = 0
+            all_node_scores["aggregated_geo_threshold"] = 0
+        else:
+            all_node_scores["aggregated_geo_score"].fillna(
+                all_node_scores["normalized_geo_score"], inplace=True)
+            all_node_scores["aggregated_geo_threshold"].fillna(
+                all_node_scores["geo_threshold"], inplace=True)
+
+        if (no_geo_scores or score_without_geo):
+            # if there are no geo scores, or it was requested to not use geo scores to affect
+            # the final combined score, set the combined scores to be the same as the vision scores
+            all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"]
+        else:
+            # the combined score is simply the normalized vision score
+            # multiplied by the normalized geo score
+            all_node_scores["aggregated_combined_score"] = all_node_scores["aggregated_vision_score"] * \
+                all_node_scores["aggregated_geo_score"]
+
+        # calculate a normalized combined score so all values add to 1, to be used for thresholding
+        sum_of_root_node_aggregated_combined_scores = all_node_scores.query(
+            "parent_taxon_id.isnull()")["aggregated_combined_score"].sum()
+        all_node_scores["normalized_aggregated_combined_score"] = all_node_scores[
+            "aggregated_combined_score"] / sum_of_root_node_aggregated_combined_scores
+
+        if debug:
+            print("Aggregation Time: %0.2fms" % ((time.time() - start_time) * 1000.))
+            thresholded_results = all_node_scores.query("normalized_aggregated_combined_score > 0.05")
+            print("\nTree of aggregated results:")
+            ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=(
+                lambda row: f'{row.name} [' +
+                            f'V:{round(row.aggregated_vision_score, 4)}, ' +
+                            f'G:{round(row.aggregated_geo_score, 4)}, ' +
+                            f'C:{round(row.aggregated_combined_score, 4)}, ' +
+                            f'NC:{round(row.normalized_aggregated_combined_score, 4)}]'))
+            print("")
+        return all_node_scores
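To make the aggregation rules concrete: vision scores sum up the tree, geo scores propagate as a max, and the aggregated geo threshold drops to 0 as soon as any descendant clears its own threshold. A hand-worked sketch of that rollup with hypothetical taxa and scores:

```python
# hypothetical pruned tree: genus (id 10) with two species leaves (11, 12)
taxon_ancestors = {11: [10], 12: [10]}
leaves = [
    # (taxon_id, vision_score, geo_score, geo_threshold)
    (11, 0.50, 0.30, 0.20),   # above its geo threshold
    (12, 0.25, 0.10, 0.40),   # below its geo threshold
]

agg = {}
for taxon_id, vision, geo, geo_threshold in leaves:
    for ancestor in taxon_ancestors[taxon_id]:
        node = agg.setdefault(ancestor, {"vision": 0, "geo": 0, "geo_thres": 100})
        node["vision"] += vision             # sum of descendant vision scores
        node["geo"] = max(node["geo"], geo)  # max of descendant geo scores
        if geo > geo_threshold:
            node["geo_thres"] = 0            # some descendant is "in range"

print(agg[10])  # {'vision': 0.75, 'geo': 0.3, 'geo_thres': 0}
```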
+
+    def h3_04_geo_results_for_taxon(self, taxon_id, bounds=[], thresholded=False, raw_results=False):
+        if (self.geo_elevation_cells is None) or (self.geo_elevation_model is None):
+            return
+        try:
+            taxon = self.taxonomy.df.loc[taxon_id]
+        except Exception as e:
+            print(f'taxon `{taxon_id}` does not exist in the taxonomy')
+            raise e
+        if math.isnan(taxon["leaf_class_id"]):
+            return
+
+        geo_scores = self.geo_elevation_model.eval_one_class_elevation_from_features(
+            self.geo_model_features, int(taxon["leaf_class_id"]))
+        geo_score_cells = self.geo_elevation_cells.copy()
+        geo_score_cells["geo_score"] = tf.squeeze(geo_scores).numpy()
+        if thresholded:
+            geo_score_cells = geo_score_cells.query(f'geo_score > {taxon["geo_threshold"]}')
+        else:
+            # return scores more than 10% of the taxon threshold, or more than 0.0001, whichever
+            # is smaller. This reduces the data that needs to be rendered client-side for the
+            # Data Layer mapping approach, and can maybe be removed once we switch to map tiles
+            lower_bound_score = np.array([0.0001, taxon["geo_threshold"] / 10]).min()
+            geo_score_cells = geo_score_cells.query(f'geo_score > {lower_bound_score}')
+
+        if bounds:
+            min_score = geo_score_cells["geo_score"].min()
+            max_score = geo_score_cells["geo_score"].max()
+            geo_score_cells = InatInferrer.filter_geo_dataframe_by_bounds(geo_score_cells, bounds)
+            # perform a log transform on the scores based on the min/max score for the unbounded set
+            geo_score_cells["geo_score"] = \
+                (np.log10(geo_score_cells["geo_score"]) - math.log10(min_score)) / \
+                (math.log10(max_score) - math.log10(min_score))
+
+        if raw_results:
+            return geo_score_cells
+        return dict(zip(geo_score_cells.index.astype(str), geo_score_cells["geo_score"]))
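For a sense of the response shape, a hypothetical call is sketched below; the taxon ID, bounds, and cell values are invented, and when `bounds` is passed the scores come back log-normalized against the unbounded min/max as implemented above:

```python
# sketch: assumes `inferrer` is an InatInferrer built from a full config
cells = inferrer.h3_04_geo_results_for_taxon(
    12345,                             # hypothetical leaf taxon_id
    bounds=[35.0, -10.0, 60.0, 30.0],  # [swlat, swlng, nelat, nelng]
    thresholded=True)
# keys are H3 resolution-4 cell indexes, values are normalized geo scores,
# e.g. {"8428309ffffffff": 0.73, ...}
```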
+
+    def h3_04_taxon_range(self, taxon_id, bounds=[]):
+        taxon_range_path = os.path.join(self.config["taxon_ranges_path"], f'{taxon_id}.csv')
+        if not os.path.exists(taxon_range_path):
+            return None
+        taxon_range_df = pd.read_csv(taxon_range_path, names=["h3_04"], header=None). \
+            sort_values("h3_04").set_index("h3_04").sort_index()
+        taxon_range_df = InatInferrer.add_lat_lng_to_h3_geo_dataframe(taxon_range_df)
+        if bounds:
+            taxon_range_df = InatInferrer.filter_geo_dataframe_by_bounds(taxon_range_df, bounds)
+        taxon_range_df["value"] = 1
+        return dict(zip(taxon_range_df.index.astype(str), taxon_range_df["value"]))
+
+    def h3_04_taxon_range_comparison(self, taxon_id, bounds=[]):
+        geomodel_results = self.h3_04_geo_results_for_taxon(taxon_id, bounds, thresholded=True) or {}
+        taxon_range_results = self.h3_04_taxon_range(taxon_id, bounds) or {}
+        combined_results = {}
+        for cell_key in geomodel_results:
+            if cell_key in taxon_range_results:
+                combined_results[cell_key] = 0.5
+            else:
+                combined_results[cell_key] = 0
+        for cell_key in taxon_range_results:
+            if cell_key not in geomodel_results:
+                combined_results[cell_key] = 1
+        return combined_results
+
+    def h3_04_bounds(self, taxon_id):
+        geomodel_results = self.h3_04_geo_results_for_taxon(
+            taxon_id, bounds=None, thresholded=True, raw_results=True)
+        if geomodel_results is None:
+            return
+        return {
+            "swlat": geomodel_results["lat"].min(),
+            "swlng": geomodel_results["lng"].min(),
+            "nelat": geomodel_results["lat"].max(),
+            "nelng": geomodel_results["lng"].max()
+        }
+
+    @staticmethod
+    def add_lat_lng_to_h3_geo_dataframe(geo_df):
+        geo_df = geo_df.h3.h3_to_geo()
+        geo_df["lng"] = geo_df["geometry"].x
+        geo_df["lat"] = geo_df["geometry"].y
+        geo_df.pop("geometry")
+        return geo_df
+
+    @staticmethod
+    def filter_geo_dataframe_by_bounds(geo_df, bounds):
+        # this is querying on the centroid, but cells just outside the bounds may have a
+        # centroid outside the bounds while part of the polygon is within the bounds. Add
+        # a small buffer to ensure this returns any cell whose polygon is
+        # even partially within the bounds
+        buffer = 0.6
+
+        # similarly, the centroid may be on the other side of the antimeridian, so look up
+        # cells that might be just over the antimeridian on either side
+        antimeridian_condition = ""
+        if bounds[1] < -179:
+            antimeridian_condition = "or (lng > 179)"
+        elif bounds[3] > 179:
+            antimeridian_condition = "or (lng < -179)"
+
+        # query for cells within the buffered bounds, and potentially
+        # on the other side of the antimeridian
+        query = f'lat >= {bounds[0] - buffer} and lat <= {bounds[2] + buffer} and ' + \
+            f' ((lng >= {bounds[1] - buffer} and lng <= {bounds[3] + buffer})' + \
+            f' {antimeridian_condition})'
+        return geo_df.query(query)
diff --git a/lib/inat_vision_api.py b/lib/inat_vision_api.py
index 1cdb8f1..8703c43 100644
--- a/lib/inat_vision_api.py
+++ b/lib/inat_vision_api.py
@@ -8,7 +8,6 @@
 from flask import Flask, request, render_template
 from web_forms import ImageForm
 from inat_inferrer import InatInferrer
-from model_scoring import ModelScoring
 
 
 class InatVisionAPI:
@@ -18,12 +17,59 @@
     def __init__(self, config):
         self.app = Flask(__name__)
         self.app.secret_key = config["app_secret"]
         self.upload_folder = "static/"
-        self.app.add_url_rule(
-            "/", "index", self.index_route, methods=["GET", "POST"])
+        self.app.add_url_rule("/", "index", self.index_route, methods=["GET", "POST"])
+        self.app.add_url_rule("/h3_04", "h3_04", self.h3_04_route, methods=["GET"])
+        self.app.add_url_rule("/h3_04_taxon_range", "h3_04_taxon_range",
+                              self.h3_04_taxon_range_route, methods=["GET"])
+        self.app.add_url_rule("/h3_04_taxon_range_comparison", "h3_04_taxon_range_comparison",
+                              self.h3_04_taxon_range_comparison_route, methods=["GET"])
+        self.app.add_url_rule("/h3_04_bounds", "h3_04_bounds",
+                              self.h3_04_bounds_route, methods=["GET"])
 
     def setup_inferrer(self, config):
         self.inferrer = InatInferrer(config)
 
+    def h3_04_route(self):
+        return self.h3_04_default_route(self.inferrer.h3_04_geo_results_for_taxon)
+
+    def h3_04_taxon_range_route(self):
+        if "taxon_ranges_path" not in self.inferrer.config:
+            return "taxon range data unavailable", 422
+        return self.h3_04_default_route(self.inferrer.h3_04_taxon_range)
+
+    def h3_04_taxon_range_comparison_route(self):
+        if "taxon_ranges_path" not in self.inferrer.config:
+            return "taxon range data unavailable", 422
+        return self.h3_04_default_route(self.inferrer.h3_04_taxon_range_comparison)
+
+    def h3_04_default_route(self, h3_04_method):
+        taxon_id, error_message, error_code = self.valid_leaf_taxon_id_for_request(request)
+        if error_message:
+            return error_message, error_code
+
+        bounds, error_message, error_code = self.valid_bounds_for_request(request)
+        if error_message:
+            return error_message, error_code
+
+        if h3_04_method == self.inferrer.h3_04_geo_results_for_taxon \
+           and "thresholded" in request.args and request.args["thresholded"] == "true":
+            results_dict = h3_04_method(taxon_id, bounds, thresholded=True)
+        else:
+            results_dict = h3_04_method(taxon_id, bounds)
+        if results_dict is None:
+            return f'Unknown taxon_id {taxon_id}', 422
+        return InatVisionAPI.round_floats(results_dict, 8)
+
+    def h3_04_bounds_route(self):
+        taxon_id, error_message, error_code = self.valid_leaf_taxon_id_for_request(request)
+        if error_message:
+            return error_message, error_code
+
+        results_dict = self.inferrer.h3_04_bounds(taxon_id)
+        if results_dict is None:
+            return f'Unknown taxon_id {taxon_id}', 422
+        return results_dict
+
     def index_route(self):
         form =
ImageForm() if "observation_id" in request.args: @@ -41,6 +87,7 @@ def index_route(self): lng = form.lng.data file_path = None image_uuid = None + iconic_taxon_id = None if observation_id: image_uuid = "downloaded-obs-" + observation_id file_path, lat, lng, iconic_taxon_id = self.download_observation( @@ -48,67 +95,95 @@ def index_route(self): else: image_uuid = str(uuid.uuid4()) file_path = self.process_upload(form.image.data, image_uuid) - iconic_taxon_id = None + if form.taxon_id.data and form.taxon_id.data.isdigit(): + iconic_taxon_id = int(form.taxon_id.data) if file_path is None: return render_template("home.html") - image = self.inferrer.prepare_image_for_inference(file_path, image_uuid) - scores = self.score_image(image, lat, lng, iconic_taxon_id, geomodel) - self.write_logstash(image_uuid, file_path, request_start_datetime, request_start_time) + scores = self.score_image(form, file_path, lat, lng, iconic_taxon_id, geomodel) + InatVisionAPI.write_logstash( + image_uuid, file_path, request_start_datetime, request_start_time) return scores else: return render_template("home.html") - def score_image(self, image, lat, lng, iconic_taxon_id, geomodel): - # Vision - vision_start_time = time.time() - vision_scores = self.inferrer.vision_inferrer.process_image(image, iconic_taxon_id) - vision_total_time = time.time() - vision_start_time - print("Vision Time: %0.2fms" % (vision_total_time * 1000.)) - - if geomodel != "true": - top_x = dict(sorted( - vision_scores.items(), key=lambda x: x[1], reverse=True)[:100]) - to_return = {} - for index, arg in enumerate(top_x): - to_return[arg] = round(vision_scores[arg] * 100, 6) - return to_return - - # Geo - geo_start_time = time.time() - if lat is not None and lat != "" and lng is not None and lng != "": - geo_scores = self.inferrer.geo_model.predict(lat, lng, iconic_taxon_id) + def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel): + score_without_geo = (form.score_without_geo.data == "true") + filter_taxon = self.inferrer.lookup_taxon(iconic_taxon_id) + leaf_scores = self.inferrer.predictions_for_image( + file_path, lat, lng, filter_taxon, score_without_geo + ) + + if form.aggregated.data == "true": + aggregated_results = self.inferrer.aggregate_results(leaf_scores, filter_taxon, + score_without_geo) + columns_to_return = [ + "aggregated_combined_score", + "aggregated_geo_score", + "taxon_id", + "name", + "aggregated_vision_score", + "aggregated_geo_threshold" + ] + column_mapping = { + "taxon_id": "id", + "aggregated_combined_score": "combined_score", + "aggregated_geo_score": "geo_score", + "aggregated_vision_score": "vision_score", + "aggregated_geo_threshold": "geo_threshold" + } + + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + + # set a cutoff where branches whose combined scores are below the threshold are ignored + # TODO: this threshold is completely arbitrary and needs testing + aggregated_results = aggregated_results.query("normalized_aggregated_combined_score > 0.05") + + # after setting a cutoff, get the parent IDs of the remaining taxa + parent_taxon_ids = aggregated_results["parent_taxon_id"].values # noqa: F841 + # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the + # taxa who are not parents of any remaining taxa + leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids") + + leaf_results = leaf_results.sort_values("aggregated_combined_score", ascending=False).head(100) + score_columns = ["aggregated_combined_score", "aggregated_geo_score", + 
"aggregated_vision_score", "aggregated_geo_threshold"] + leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index") + final_results = leaf_results[columns_to_return].rename(columns=column_mapping) else: - geo_scores = {} - geo_total_time = time.time() - geo_start_time - print("GeoTime: %0.2fms" % (geo_total_time * 1000.)) - - # Scoring - scoring_start_time = time.time() - combined_scores = ModelScoring.combine_vision_and_geo_scores( - vision_scores, geo_scores, - self.inferrer.config["geo_min"] if "geo_min" in self.inferrer.config else 0) - - # results.aggregate_scores() - scoring_total_time = time.time() - scoring_start_time - print("Score Time: %0.2fms" % (scoring_total_time * 1000.)) - - top_x = dict(sorted( - combined_scores.items(), key=lambda x: x[1], reverse=True)[:100]) - to_return = [] - for index, arg in enumerate(top_x): - geo_score = geo_scores[arg] if arg in geo_scores else 0.0000000001 - to_return.append({ - "combined_score": round(combined_scores[arg] * 100, 6), - "vision_score": round(vision_scores[arg] * 100, 6), - "geo_score": round(geo_score * 100, 6), - "id": self.inferrer.taxonomy.taxa[arg].id, - "name": self.inferrer.taxonomy.taxa[arg].name - }) - - total_time = time.time() - vision_start_time - print("Total: %0.2fms" % (total_time * 1000.)) - return to_return + no_geo_scores = (leaf_scores["geo_score"].max() == 0) + top_combined_score = leaf_scores.sort_values("combined_score", ascending=False).head(1)["combined_score"].values[0] + # set a cutoff so results whose combined scores are + # much lower than the best score are not returned + leaf_scores = leaf_scores.query(f'combined_score > {top_combined_score * 0.001}') + + top100 = leaf_scores.sort_values("combined_score", ascending=False).head(100) + score_columns = ["combined_score", "geo_score", "normalized_vision_score", "geo_threshold"] + top100[score_columns] = top100[score_columns].multiply(100, axis="index") + + # legacy dict response + if geomodel != "true": + top_taxon_combined_scores = top100[ + ["taxon_id", "combined_score"] + ].to_dict(orient="records") + return {x["taxon_id"]: x["combined_score"] for x in top_taxon_combined_scores} + + # new array response + columns_to_return = [ + "combined_score", + "geo_score", + "taxon_id", + "name", + "normalized_vision_score", + "geo_threshold" + ] + column_mapping = { + "taxon_id": "id", + "normalized_vision_score": "vision_score" + } + final_results = top100[columns_to_return].rename(columns=column_mapping) + + return final_results.to_dict(orient="records") def process_upload(self, form_image_data, image_uuid): if form_image_data is None: @@ -137,7 +212,34 @@ def download_observation(self, observation_id, image_uuid): # return the path to the cached image, coordinates, and iconic taxon return cache_path, latlng[0], latlng[1], data["results"][0]["taxon"]["iconic_taxon_id"] - def write_logstash(self, image_uuid, file_path, request_start_datetime, request_start_time): + def valid_leaf_taxon_id_for_request(self, request): + if "taxon_id" in request.args: + taxon_id = request.args["taxon_id"] + else: + return None, "taxon_id required", 422 + if not taxon_id.isdigit(): + return None, "taxon_id must be an integer", 422 + + taxon_id = int(taxon_id) + if float(taxon_id) not in self.inferrer.taxonomy.leaf_df["taxon_id"].values: + return None, f'Unknown taxon_id {taxon_id}', 422 + return taxon_id, None, None + + def valid_bounds_for_request(self, request): + bounds = [] + if "swlat" in request.args: + try: + swlat = float(request.args["swlat"]) + 
swlng = float(request.args["swlng"]) + nelat = float(request.args["nelat"]) + nelng = float(request.args["nelng"]) + except ValueError: + return None, "bounds must be floats", 422 + bounds = [swlat, swlng, nelat, nelng] + return bounds, None, None + + @staticmethod + def write_logstash(image_uuid, file_path, request_start_datetime, request_start_time): request_end_time = time.time() request_time = round((request_end_time - request_start_time) * 1000, 6) logstash_log = open('log/logstash.log', 'a') @@ -149,3 +251,13 @@ def write_logstash(self, image_uuid, file_path, request_start_datetime, request_ json.dump(log_data, logstash_log) logstash_log.write("\n") logstash_log.close() + + @staticmethod + def round_floats(o, sig): + if isinstance(o, float): + return round(o, sig) + if isinstance(o, dict): + return {k: InatVisionAPI.round_floats(v, sig) for k, v in o.items()} + if isinstance(o, (list, tuple)): + return [InatVisionAPI.round_floats(x, sig) for x in o] + return o diff --git a/lib/model_scoring.py b/lib/model_scoring.py deleted file mode 100644 index 645de55..0000000 --- a/lib/model_scoring.py +++ /dev/null @@ -1,11 +0,0 @@ -class ModelScoring: - - @staticmethod - def combine_vision_and_geo_scores(vision_scores, geo_scores, geo_min=0): - combined_scores = {} - for arg in vision_scores: - geo_score = geo_scores[arg] if arg in geo_scores else 0.0000000001 - if geo_min > 0 and geo_score < geo_min: - geo_score = geo_min - combined_scores[arg] = vision_scores[arg] * geo_score - return combined_scores diff --git a/lib/model_taxonomy.py b/lib/model_taxonomy.py index 0362587..d390516 100644 --- a/lib/model_taxonomy.py +++ b/lib/model_taxonomy.py @@ -11,7 +11,7 @@ def __init__(self, path): def load_mapping(self, path): self.node_key_to_leaf_class_id = {} self.leaf_class_to_taxon = {} - # there is no taxon with ID 0, but roots of the taxonomy with have a parent ID of 0, + # there is no taxon with ID 0, but roots of the taxonomy have a parent ID of 0, # so create a fake taxon of Life to represent the root of the entire tree self.taxa = {0: Taxon({"name": "Life", "depth": 0})} self.taxon_children = {} diff --git a/lib/model_taxonomy_dataframe.py b/lib/model_taxonomy_dataframe.py new file mode 100644 index 0000000..a707c7d --- /dev/null +++ b/lib/model_taxonomy_dataframe.py @@ -0,0 +1,74 @@ +import math +import pandas as pd + + +class ModelTaxonomyDataframe: + + def __init__(self, path, thresholds_path): + self.load_mapping(path, thresholds_path) + + def load_mapping(self, path, thresholds_path): + self.df = pd.read_csv(path) + # left and right will be used to store nested set indices + self.df["left"] = pd.Series([], dtype=object) + self.df["right"] = pd.Series([], dtype=object) + self.taxon_children = {} + self.taxon_row_mapping = {} + self.taxon_ancestors = {} + for index, taxon in self.df.iterrows(): + self.taxon_row_mapping[taxon["taxon_id"]] = index + parent_id = 0 if math.isnan(taxon["parent_taxon_id"]) else int(taxon["parent_taxon_id"]) + if parent_id not in self.taxon_children: + self.taxon_children[parent_id] = [] + self.taxon_children[parent_id].append(taxon["taxon_id"]) + self.assign_nested_values() + self.df = self.df.set_index("taxon_id", drop=False).sort_index() + if thresholds_path is not None: + thresholds = pd.read_csv(thresholds_path)[["taxon_id", "thres"]]. 
\ + rename(columns={"thres": "geo_threshold"}).set_index("taxon_id").sort_index() + self.df = self.df.join(thresholds) + + # create a data frame with just the leaf taxa using leaf_class_id as the index + self.leaf_df = self.df.query("leaf_class_id.notnull()").set_index( + "leaf_class_id", drop=False).sort_index() + + # calculate nested set left and right values. These can be later used for an efficient + # way to calculate if a taxon is an ancestor or descendant of another + def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]): + for child_id in self.taxon_children[taxon_id]: + self.df.at[self.taxon_row_mapping[child_id], "left"] = index + self.taxon_ancestors[child_id] = ancestor_taxon_ids + index += 1 + if child_id in self.taxon_children: + child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id] + index = self.assign_nested_values(child_id, index, child_ancestor_taxon_ids) + self.df.at[self.taxon_row_mapping[child_id], "right"] = index + index += 1 + return index + + @staticmethod + def children(df, taxon_id): + if taxon_id == 0: + return df.query("parent_taxon_id.isnull()") + return df.query(f'parent_taxon_id == {taxon_id}') + + @staticmethod + def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None): + children = ModelTaxonomyDataframe.children(df, taxon_id) + index = 0 + if "aggregated_combined_score" in children: + children = children.sort_values("aggregated_combined_score", ascending=False) + else: + children = children.sort_values("name") + for row in children.itertuples(): + last_in_branch = (index == len(children) - 1) + index += 1 + icon = "└──" if last_in_branch else "├──" + prefixIcon = " " if last_in_branch else "│ " + print(f'{ancestor_prefix}{icon}', end="") + if display_taxon_lambda is None: + print(f'{row.name} :: {row.left}:{row.right}') + else: + print(display_taxon_lambda(row)) + if row.right != row.left + 1: + ModelTaxonomyDataframe.print(df, row.taxon_id, f"{ancestor_prefix}{prefixIcon}", display_taxon_lambda) diff --git a/lib/model_test_data_exporter.py b/lib/model_test_data_exporter.py new file mode 100644 index 0000000..df60bad --- /dev/null +++ b/lib/model_test_data_exporter.py @@ -0,0 +1,204 @@ +import pandas as pd +import requests +import json +import prison +import re +from datetime import datetime + + +class ModelTestDataExporter: + + def __init__(self, **args): + self.cmd_args = args + self.load_train_data_photo_ids() + + def load_train_data_photo_ids(self, train_data_paths=[]): + if not self.cmd_args["exclude_train_photos_path"]: + self.train_data_photo_ids = [] + return + self.train_data_photo_ids = pd.concat( + map(lambda x: pd.read_csv(x, usecols=["photo_id"]), + self.cmd_args["exclude_train_photos_path"]) + ).drop_duplicates("photo_id").set_index("photo_id").sort_index().index + + def generate_from_cmd_args(self): + additional_parameters = {} + if self.cmd_args["place_id"]: + additional_parameters["place_id"] = self.cmd_args["place_id"] + if self.cmd_args["taxon_id"]: + additional_parameters["taxon_id"] = self.cmd_args["taxon_id"] + currentDatetime = datetime.now() + timestamp = currentDatetime.strftime("%Y%m%d") + export_path = f'test-obs-{timestamp}' + if additional_parameters: + parameter_string = "-".join(map(lambda index: f'{index}-{additional_parameters[index]}', + additional_parameters)) + export_path += "-" + parameter_string + if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]: + export_path += "-" + self.cmd_args["filename_suffix"] + export_path += ".csv" + 
self.generate_test_data(export_path, self.cmd_args["limit"], additional_parameters) + + def export_test_data_parameters(self, additional_parameters={}): + api_parameters = {} + api_parameters["quality_grade"] = "research,casual" + api_parameters["rank"] = "species" + api_parameters["photos"] = "true" + api_parameters["geo"] = "true" + api_parameters["identified"] = "true" + api_parameters["per_page"] = 200 + api_parameters["order_by"] = "random" + api_parameters["identifications"] = "most_agree" + api_parameters["ttl"] = -1 + api_parameters.update(additional_parameters) + print(api_parameters) + + fields = { + "quality_grade": True, + "observed_on_details": { + "date": True + }, + "location": True, + "photos": { + "url": True + }, + "community_taxon_id": True, + "taxon": { + "id": True, + "ancestor_ids": True, + "iconic_taxon_id": True + }, + "quality_metrics": { + "agree": True, + "metric": True + } + } + api_parameters["fields"] = prison.dumps(fields) + return api_parameters + + def process_api_response(self, api_parameters, used_observations): # noqa: C901 + response = requests.get("https://api.inaturalist.org/v2/observations", + params=api_parameters) + json_object = response.json() + useable_rows = [] + for row in json_object["results"]: + if row["uuid"] in used_observations: + continue + + # must have a taxon and observed_on_details + if not row["taxon"] or "observed_on_details" not in row \ + or not row["observed_on_details"] or "taxon" not in row \ + or "iconic_taxon_id" not in row["taxon"] or not row["taxon"]["iconic_taxon_id"]: + used_observations[row["uuid"]] = True + continue + + # must pass quality metrics except wild + metric_counts = {} + for metric in row["quality_metrics"]: + if metric["metric"] not in metric_counts: + metric_counts[metric["metric"]] = 0 + if metric["agree"]: + metric_counts[metric["metric"]] += 1 + else: + metric_counts[metric["metric"]] -= 1 + if ("location" in metric_counts and metric_counts["location"] < 0) \ + or ("evidence" in metric_counts and metric_counts["evidence"] < 0) \ + or ("date" in metric_counts and metric_counts["date"] < 0) \ + or ("recent" in metric_counts and metric_counts["recent"] < 0): + used_observations[row["uuid"]] = True + continue + + # check if any photos are included in the test data + photo_in_training_data = False + for photo in row["photos"]: + if photo["id"] in self.train_data_photo_ids: + photo_in_training_data = True + break + if photo_in_training_data is True: + used_observations[row["uuid"]] = True + continue + if re.search(r"\.jpe?g", row["photos"][0]["url"]) is None: + used_observations[row["uuid"]] = True + continue + + if row["quality_grade"] == "casual" and not (row["community_taxon_id"] and row["community_taxon_id"] == row["taxon"]["id"]): + used_observations[row["uuid"]] = True + continue + + useable_rows.append(row) + used_observations[row["uuid"]] = True + return useable_rows + + def generate_test_data(self, filename, max_results=5000, additional_parameters={}): + iterations_with_zero_results = 0 + rows_to_use = [] + used_observations = {} + api_parameters = self.export_test_data_parameters(additional_parameters) + + while len(rows_to_use) < max_results and iterations_with_zero_results < 5: + print(f'Fetching more results... 
{len(rows_to_use)} so far') + useable_rows = self.process_api_response(api_parameters, used_observations) + print(f'{len(useable_rows)} this batch') + if not useable_rows: + iterations_with_zero_results += 1 + continue + iterations_with_zero_results = 0 + rows_to_use += useable_rows + + columns = [ + "observation_id", + "observed_on", + "iconic_taxon_id", + "taxon_id", + "taxon_ancestry", + "lat", + "lng", + "photo_url" + ] + output_file = open(filename, "w") + output_file.write(",".join(columns) + "\n") + for row in rows_to_use[:max_results]: + [latitude, longitude] = row["location"].split(",") + columns_to_write = [ + row["uuid"], + row["observed_on_details"]["date"], + row["taxon"]["iconic_taxon_id"], + row["taxon"]["id"], + "/".join(map(str, row["taxon"]["ancestor_ids"])), + latitude, + longitude, + row["photos"][0]["url"].replace("square", "medium").replace("http://", "https://") + ] + output_file.write(",".join(map(str, columns_to_write)) + "\n") + + def generate_standard_set(self): + files_to_generate = { + "global": {}, + "fungi": {"taxon_id": 47170}, + "insecta": {"taxon_id": 47158}, + "mammalia": {"taxon_id": 40151}, + "plantae": {"taxon_id": 47126}, + "actinopterygii": {"taxon_id": 47178}, + "reptilia": {"taxon_id": 26036}, + "amphibia": {"taxon_id": 20978}, + "arachnida": {"taxon_id": 47119}, + "aves": {"taxon_id": 3}, + "animalia": {"taxon_id": 1}, + "mollusca": {"taxon_id": 47115}, + "north-america": {"place_id": 97394}, + "south-america": {"place_id": 97389}, + "europe": {"place_id": 97391}, + "asia": {"place_id": 97395}, + "africa": {"place_id": 97392}, + "oceania": {"place_id": 97393} + } + + currentDatetime = datetime.now() + timestamp = currentDatetime.strftime("%Y%m%d") + + for key in files_to_generate: + export_path = f'test-obs-{timestamp}-{key}' + if "filename_suffix" in self.cmd_args and self.cmd_args["filename_suffix"]: + export_path += "-" + self.cmd_args["filename_suffix"] + export_path += ".csv" + self.generate_test_data(export_path, self.cmd_args["limit"], files_to_generate[key]) diff --git a/lib/pt_geo_prior_model.py b/lib/pt_geo_prior_model.py index 218a0d0..b81b4e9 100644 --- a/lib/pt_geo_prior_model.py +++ b/lib/pt_geo_prior_model.py @@ -29,7 +29,7 @@ def predict(self, latitude, longitude, filter_taxon_id=None): filter_taxon = None if filter_taxon_id is not None: try: - filter_taxon = self.taxonomy.taxa[filter_taxon_id] + filter_taxon = self.taxonomy.df.iloc[filter_taxon_id] except Exception as e: print(f'filter_taxon `{filter_taxon_id}` does not exist in the taxonomy') raise e diff --git a/lib/templates/home.html b/lib/templates/home.html index baad79e..37244e9 100644 --- a/lib/templates/home.html +++ b/lib/templates/home.html @@ -13,11 +13,11 @@