Skip to content

Commit

Permalink
refactor vision_testing to record obs-level stats, test directories, …
Browse files Browse the repository at this point in the history
…output to CSV; alternate aggregated scoring
  • Loading branch information
pleary committed Feb 9, 2024
1 parent 96cdf3d commit c4a946f
Show file tree
Hide file tree
Showing 8 changed files with 423 additions and 257 deletions.
1 change: 1 addition & 0 deletions lib/inat_inferrer.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ def aggregate_results(self, leaf_scores, filter_taxon, score_without_geo=False,
print("\nTree of aggregated results:")
ModelTaxonomyDataframe.print(thresholded_results, display_taxon_lambda=(
lambda row: f"{row.name} ["
f"ID:{row.taxon_id}, "
f"V:{round(row.aggregated_vision_score, 4)}, "
f"G:{round(row.aggregated_geo_score, 4)}, "
f"C:{round(row.aggregated_combined_score, 4)}, "
Expand Down
112 changes: 92 additions & 20 deletions lib/inat_vision_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import urllib
import uuid
import json
import pandas as pd

from flask import Flask, request, render_template
from web_forms import ImageForm
from inat_inferrer import InatInferrer
from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe


class InatVisionAPI:
Expand Down Expand Up @@ -107,6 +109,52 @@ def index_route(self):
else:
return render_template("home.html")

def best_leaves_from_aggregated_results(self, aggregated_results, iteration=0):
    """Pick the leaves of the score-pruned taxonomy and discount their ancestors.

    Rows of ``aggregated_results`` whose ``selection_score`` passes the
    threshold (and, after the first pass, whose ``rank_level`` is at most 30)
    form a pruned tree. Its leaves are returned as results, and each leaf's
    ``selection_score`` and ``aggregated_combined_score`` are subtracted from
    the leaf itself and from every remaining ancestor, so a later pass only
    sees the score that has not been accounted for yet.

    Args:
        aggregated_results: DataFrame with the columns ``taxon_id``,
            ``parent_taxon_id``, ``rank_level``, ``left``, ``right``,
            ``selection_score``, ``aggregated_combined_score`` and
            ``normalized_aggregated_combined_score``. It is not modified.
        iteration: zero-based pass number; pass 0 uses a lower score
            threshold and no rank cutoff.

    Returns:
        A two-item list ``[leaf_results, remaining_results]``. ``leaf_results``
        keeps the scores the leaves had before discounting.
        ``remaining_results`` holds the rows still worth another pass and is
        an empty, column-less DataFrame once a single leaf represents at
        least 80% of the aggregated score.
    """
    # nothing to select from; this also accepts the column-less frame that
    # this method itself returns as ``remaining_results``
    if aggregated_results.empty:
        return [aggregated_results.copy(), pd.DataFrame()]

    # use a lower threshold on the first pass to have higher representation from
    # original model leaf taxa
    selection_score_threshold = 0.05 if iteration == 0 else 0.1
    keep = aggregated_results["selection_score"] > selection_score_threshold
    # set a rank level cutoff on higher taxa to include in results
    if iteration > 0:
        keep = keep & (aggregated_results["rank_level"] <= 30)
    # explicit copy: scores are adjusted in place below, and writing into a
    # filtered selection would raise pandas' SettingWithCopyWarning
    remaining_results = aggregated_results[keep].copy()

    # the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the
    # taxa who are not parents of any remaining taxa
    is_parent = remaining_results["taxon_id"].isin(
        remaining_results["parent_taxon_id"].values
    )
    # boolean indexing yields a separate frame, so the leaves keep their
    # original scores while the remaining rows are discounted
    leaf_results = remaining_results[~is_parent]

    # lower the scores of ancestors by the scores of the taxa being moved into the result set
    left_bounds = remaining_results["left"]
    right_bounds = remaining_results["right"]
    for selection_score, aggregated_combined_score, left, right in zip(
        leaf_results["selection_score"],
        leaf_results["aggregated_combined_score"],
        leaf_results["left"],
        leaf_results["right"]
    ):
        # nested-set containment: a row spans the leaf when its interval
        # encloses the leaf's [left, right] interval (the leaf itself included)
        self_and_ancestors = (left_bounds <= left) & (right_bounds >= right)
        remaining_results.loc[
            self_and_ancestors, "selection_score"
        ] -= selection_score
        remaining_results.loc[
            self_and_ancestors, "aggregated_combined_score"
        ] -= aggregated_combined_score

    # stop picking taxa if one represents more than 80% of aggregated scores
    if leaf_results["normalized_aggregated_combined_score"].max() >= 0.8:
        remaining_results = pd.DataFrame()
    else:
        remaining_results = remaining_results[
            remaining_results["selection_score"] > 0.1
        ]
    return [leaf_results, remaining_results]

def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel):
score_without_geo = (form.score_without_geo.data == "true")
filter_taxon = self.inferrer.lookup_taxon(iconic_taxon_id)
Expand All @@ -117,6 +165,50 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel):
if form.aggregated.data == "true":
aggregated_results = self.inferrer.aggregate_results(leaf_scores, filter_taxon,
score_without_geo)
if form.format.data == "tree":
aggregated_results = aggregated_results.query(
"normalized_aggregated_combined_score > 0.001"
)
printable_tree = ModelTaxonomyDataframe.printable_tree(
aggregated_results,
display_taxon_lambda=(
lambda row: f"{row.name}\t\t["
f"ID:{row.taxon_id}, "
f"V:{round(row.aggregated_vision_score, 4)}, "
f"G:{round(row.aggregated_geo_score, 4)}, "
f"C:{round(row.aggregated_combined_score, 4)}, "
f"NC:{round(row.normalized_aggregated_combined_score, 4)}]"
)
)
return "<pre>" + "<br/>".join(printable_tree) + "</pre>"

aggregated_results = aggregated_results.query(
"normalized_aggregated_combined_score > 0.05"
)

aggregated_results["selection_score"] = aggregated_results[
"normalized_aggregated_combined_score"
]
iteration = 0
leaf_results, remaining_results = self.best_leaves_from_aggregated_results(
aggregated_results, iteration
)
while len(remaining_results.index) > 0:
iteration += 1
next_leaf_results, remaining_results = self.best_leaves_from_aggregated_results(
remaining_results, iteration
)
leaf_results = pd.concat([leaf_results, next_leaf_results])

leaf_results = leaf_results.sort_values(
"aggregated_combined_score",
ascending=False
).head(100)

score_columns = ["aggregated_combined_score", "aggregated_geo_score",
"aggregated_vision_score", "aggregated_geo_threshold"]
leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index")

columns_to_return = [
"aggregated_combined_score",
"aggregated_geo_score",
Expand All @@ -132,26 +224,6 @@ def score_image(self, form, file_path, lat, lng, iconic_taxon_id, geomodel):
"aggregated_vision_score": "vision_score",
"aggregated_geo_threshold": "geo_threshold"
}

# set a cutoff where branches whose combined scores are below the threshold are ignored
# TODO: this threshold is completely arbitrary and needs testing
aggregated_results = aggregated_results.query(
"normalized_aggregated_combined_score > 0.05"
)

# after setting a cutoff, get the parent IDs of the remaining taxa
parent_taxon_ids = aggregated_results["parent_taxon_id"].values # noqa: F841
# the leaves of the pruned taxonomy (not leaves of the original taxonomy), are the
# taxa who are not parents of any remaining taxa
leaf_results = aggregated_results.query("taxon_id not in @parent_taxon_ids")

leaf_results = leaf_results.sort_values(
"aggregated_combined_score",
ascending=False
).head(100)
score_columns = ["aggregated_combined_score", "aggregated_geo_score",
"aggregated_vision_score", "aggregated_geo_threshold"]
leaf_results[score_columns] = leaf_results[score_columns].multiply(100, axis="index")
final_results = leaf_results[columns_to_return].rename(columns=column_mapping)
else:
top_combined_score = leaf_scores.sort_values(
Expand Down
21 changes: 15 additions & 6 deletions lib/model_taxonomy_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ def load_mapping(self, path, thresholds_path):
def assign_nested_values(self, taxon_id=0, index=0, ancestor_taxon_ids=[]):
for child_id in self.taxon_children[taxon_id]:
self.df.at[self.taxon_row_mapping[child_id], "left"] = index
self.taxon_ancestors[child_id] = ancestor_taxon_ids
child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id]
self.taxon_ancestors[child_id] = child_ancestor_taxon_ids
index += 1
if child_id in self.taxon_children:
child_ancestor_taxon_ids = ancestor_taxon_ids + [child_id]
index = self.assign_nested_values(child_id, index, child_ancestor_taxon_ids)
self.df.at[self.taxon_row_mapping[child_id], "right"] = index
index += 1
Expand All @@ -73,26 +73,35 @@ def children(df, taxon_id):

@staticmethod
def print(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None):
    """Write the taxonomy tree below ``taxon_id`` to stdout.

    The lines come from ``ModelTaxonomyDataframe.printable_tree``, called
    with the same arguments, and are emitted newline-separated in one write.
    """
    tree_lines = ModelTaxonomyDataframe.printable_tree(
        df, taxon_id, ancestor_prefix, display_taxon_lambda
    )
    tree_text = "\n".join(tree_lines)
    print(tree_text)

@staticmethod
def printable_tree(df, taxon_id=0, ancestor_prefix="", display_taxon_lambda=None):
children = ModelTaxonomyDataframe.children(df, taxon_id)
index = 0
if "aggregated_combined_score" in children:
children = children.sort_values("aggregated_combined_score", ascending=False)
else:
children = children.sort_values("name")
linesToPrint = []
for row in children.itertuples():
last_in_branch = (index == len(children) - 1)
index += 1
icon = "└──" if last_in_branch else "├──"
prefixIcon = " " if last_in_branch else "│ "
print(f"{ancestor_prefix}{icon}", end="")
lineToPrint = f"{ancestor_prefix}{icon}"
if display_taxon_lambda is None:
print(f"{row.name} :: {row.left}:{row.right}")
lineToPrint += f"{row.name} :: {row.left}:{row.right}"
else:
print(display_taxon_lambda(row))
lineToPrint += display_taxon_lambda(row)
linesToPrint.append(lineToPrint)
if row.right != row.left + 1:
ModelTaxonomyDataframe.print(
linesToPrint += ModelTaxonomyDataframe.printable_tree(
df,
row.taxon_id,
f"{ancestor_prefix}{prefixIcon}",
display_taxon_lambda
)
return linesToPrint
4 changes: 2 additions & 2 deletions lib/model_test_data_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,15 @@ async def generate_test_data(self):
await self.fetch_more_data()

async def fetch_more_data(self):
self.queue = asyncio.Queue(ModelTestDataExporter.N_WORKERS)
self.queue = asyncio.Queue()
self.workers = [asyncio.create_task(self.worker_task())
for _ in range(ModelTestDataExporter.N_WORKERS)]
min_pages_remaining = math.ceil(
(self.max_results / ModelTestDataExporter.API_REQUEST_PER_PAGE)
)
print(f"Queueing {min_pages_remaining} workers")
for i in range(min_pages_remaining):
await self.queue.put(i)
self.queue.put_nowait(i)
await self.queue.join()
for worker in self.workers:
worker.cancel()
Expand Down
4 changes: 3 additions & 1 deletion lib/templates/home.html
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,20 @@ <h2>Slim vs Legacy Model</h2>
Lng: <input type="test" name="lng" value="-70">
<br/>
<select name="format">
<option value="html">HTML</option>
<option value="json">JSON</option>
<option value="tree">Tree</option>
</select>
<br/>
<select name="geomodel">
<option value="true">With Geo</option>
<option value="false">Original</option>
</select>
<br/>
<select name="aggregated">
<option value="false">Original</option>
<option value="true">Aggregated</option>
</select>
<br/>
<select name="score_without_geo">
<option value="false">Geo affects score</option>
<option value="true">Geo does not affect score</option>
Expand Down
3 changes: 3 additions & 0 deletions lib/test_observation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ class TestObservation:
def __init__(self, row):
row["taxon_ancestry"] = row["taxon_ancestry"].split("/")
row["taxon_ancestry"] = list(map(int, row["taxon_ancestry"]))
# remove life
row["taxon_ancestry"].pop(0)
for key in row:
setattr(self, key, row[key])
self.inferrer_scores = None
self.summarized_results = {}
Loading

0 comments on commit c4a946f

Please sign in to comment.