diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..9bd7a08 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,34 @@ +name: inatVisionAPI CI + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v4 + - name: Use Python + uses: actions/setup-python@v4 + with: + python-version: '3.11.6' + cache: 'pip' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Test with pytest + run: | + coverage run -m pytest -s && coverage report --show-missing + + notify: + name: Notify Slack + needs: build + if: ${{ success() || failure() }} + runs-on: ubuntu-20.04 + steps: + - uses: iRoachie/slack-github-actions@v2.3.2 + if: env.SLACK_WEBHOOK_URL != null + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_BUILDS_WEBHOOK_URL }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/CICD-dev.yml b/.github/workflows/CICD-dev.yml index 086b6fa..5bddd82 100644 --- a/.github/workflows/CICD-dev.yml +++ b/.github/workflows/CICD-dev.yml @@ -8,26 +8,26 @@ on: jobs: build-and-test: - name: Build/Test + name: Build/Test runs-on: ubuntu-20.04 - steps: + steps: - uses: actions/checkout@v4 build-and-push-dev-docker-image: - name: Build/Push Dev Docker Image + name: Build/Push Dev Docker Image runs-on: ubuntu-20.04 - steps: + steps: - uses: actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Build/Push dev vision-api uses: docker/build-push-action@v5 with: diff --git a/.github/workflows/CICD-main.yml b/.github/workflows/CICD-main.yml index 8cdfc3b..e000de7 100644 --- a/.github/workflows/CICD-main.yml +++ b/.github/workflows/CICD-main.yml @@ -7,26 +7,26 @@ on: jobs: build-and-test: - name: Build/Test + name: Build/Test runs-on: ubuntu-20.04 - steps: + steps: - uses: actions/checkout@v4 build-and-push-staging-docker-image: - name: Build/Push Staging Docker Image + name: Build/Push Staging Docker Image runs-on: ubuntu-20.04 - steps: + steps: - uses: actions/checkout@v4 - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Build/Push staging vision-api uses: docker/build-push-action@v5 with: diff --git a/docker/logrotate-cron.sh b/docker/logrotate-cron.sh index 8ead430..873973f 100644 --- a/docker/logrotate-cron.sh +++ b/docker/logrotate-cron.sh @@ -1,2 +1,2 @@ #!/bin/sh -/usr/sbin/logrotate -s /var/vision/script/logrotate.status /var/vision/script/logrotate.conf \ No newline at end of file +/usr/sbin/logrotate -s /var/vision/script/logrotate.status /var/vision/script/logrotate.conf \ No newline at end of file diff --git a/lib/inat_inferrer.py b/lib/inat_inferrer.py index 1a6fb67..2ae5e92 100644 --- a/lib/inat_inferrer.py +++ b/lib/inat_inferrer.py @@ -24,6 +24,7 @@ class InatInferrer: def __init__(self, config): self.config = config self.setup_taxonomy(config) + self.setup_synonyms(config) self.setup_vision_model(config) self.setup_elevation_dataframe(config) self.setup_geo_model(config) @@ -35,8 +36,15 @@ def setup_taxonomy(self, config): config["tf_elev_thresholds"] if "tf_elev_thresholds" in config else None ) + def setup_synonyms(self, config): + self.synonyms = None + if "synonyms_path" in config: + if not os.path.exists(config["synonyms_path"]): + return None + self.synonyms = pd.read_csv(config["synonyms_path"]) + def setup_vision_model(self, config): - self.vision_inferrer = VisionInferrer(config["vision_model_path"], self.taxonomy) + self.vision_inferrer = VisionInferrer(config["vision_model_path"]) def setup_elevation_dataframe(self, config): self.geo_elevation_cells = None @@ -72,20 +80,6 @@ def setup_geo_model(self, config): elevation=list(self.geo_elevation_cells.elevation) ) - def prepare_image_for_inference(self, file_path): - mime_type = magic.from_file(file_path, mime=True) - # attempt to convert non jpegs - if mime_type != "image/jpeg": - im = Image.open(file_path) - image = im.convert("RGB") - else: - image = tf.io.read_file(file_path) - image = tf.image.decode_jpeg(image, channels=3) - image = tf.image.convert_image_dtype(image, tf.float32) - image = tf.image.central_crop(image, 0.875) - image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) - return tf.expand_dims(image, 0) - def vision_predict(self, image, debug=False): if debug: start_time = time.time() @@ -125,7 +119,7 @@ def predictions_for_image(self, file_path, lat, lng, filter_taxon, score_without debug=False): if debug: start_time = time.time() - image = self.prepare_image_for_inference(file_path) + image = InatInferrer.prepare_image_for_inference(file_path) raw_vision_scores = self.vision_predict(image, debug) raw_geo_scores = self.geo_model_predict(lat, lng, debug) top100 = self.combine_results(raw_vision_scores, raw_geo_scores, filter_taxon, @@ -359,6 +353,21 @@ def h3_04_bounds(self, taxon_id): "nelng": geomodel_results["lng"].max() } + @staticmethod + def prepare_image_for_inference(file_path): + mime_type = magic.from_file(file_path, mime=True) + # attempt to convert non jpegs + if mime_type != "image/jpeg": + im = Image.open(file_path) + image = im.convert("RGB") + else: + image = tf.io.read_file(file_path) + image = tf.image.decode_jpeg(image, channels=3) + image = tf.image.convert_image_dtype(image, tf.float32) + image = tf.image.central_crop(image, 0.875) + image = tf.image.resize(image, [299, 299], tf.image.ResizeMethod.NEAREST_NEIGHBOR) + return tf.expand_dims(image, 0) + @staticmethod def add_lat_lng_to_h3_geo_dataframe(geo_df): geo_df = geo_df.h3.h3_to_geo() @@ -373,7 +382,7 @@ def filter_geo_dataframe_by_bounds(geo_df, bounds): # centroid outside the bounds while part of the polygon is within the bounds. Add # a small buffer to ensure this returns any cell whose polygon is # even partially within the bounds - buffer = 0.6 + buffer = 1.3 # similarly, the centroid may be on the other side of the antimedirian, so lookup # cells that might be just over the antimeridian on either side diff --git a/lib/model_results.py b/lib/model_results.py deleted file mode 100644 index 8b77f3a..0000000 --- a/lib/model_results.py +++ /dev/null @@ -1,220 +0,0 @@ -class ModelResults: - - def __init__(self, vision_results, geo_results, taxonomy): - self.taxonomy = taxonomy - self.vision_results = vision_results - self.geo_results = geo_results - # common_ancestor is currently being used as a first-pass filter to remove - # the least likely results and reduce the number of taxa whose scores to combine. - # NOTE: This may not be helpful and needs testing for accuracy and processing time - self.common_ancestor_threshold = 0.9 - self.common_ancestor_rank_level_threshold = 50 - # fine_common_ancestor is currently being used to return as a high-confidence - # non-leaf taxon that may get presented to a user - self.fine_common_ancestor_threshold = 0.85 - self.fine_common_ancestor_rank_level_threshold = 20 - # vision scores are raw unnormalized scores from the vision model - # geo scores are raw unnormalized scores from the vision model - # combined scores are the unnormalized product of vision and geo scores - # combined_agg scores are the unnormalized sum of combined scores the descendants of a taxon - self.scores = { - "vision": {}, - "vision_agg": {}, - "geo": {}, - "combined": {}, - "combined_agg": {}, - "recursive": {} - } - - self.aggregate_scores() - recursive_results = self.recursive_results() - self.scores["recursive"] = {} - top_x = sorted( - recursive_results, key=lambda x: self.scores["combined_agg"][x], reverse=True)[:100] - for index, arg in enumerate(top_x): - self.scores["recursive"][arg] = self.scores["combined_agg"][arg] - - def aggregate_scores(self): - self.ancestor_scores = {} - self.vision_sum_scores = 0 - # loop through all vision results, calculating the sum of vision scores for each ancestor - for arg in self.vision_results: - taxon = self.taxonomy.taxa[arg] - self.vision_sum_scores += self.vision_results[arg] - # add the score of this leaf result to all of its ancestors - for ancestor in taxon.ancestors: - if ancestor not in self.ancestor_scores: - self.ancestor_scores[ancestor] = 0 - self.ancestor_scores[ancestor] += self.vision_results[arg] - - # using only the vision results, calculate a highly-likely visual common ancestor - # that is no narrower than self.common_ancestor_rank_level_threshold (currently Class). - # Taxa outside the highly-likely visual common ancestor will be ignored - # NOTE: This may not be helpful and needs testing for accuracy and processing time - self.common_ancestor = self.calculate_common_ancestor( - self.ancestor_scores, self.vision_sum_scores, self.common_ancestor_threshold, - self.common_ancestor_rank_level_threshold) - - # loop through all taxa and combine geo and vision scores, calculating - # aggregate scores for non-leaf taxa as well - self.aggregate_scores_recursive() - - # after combining vision and geo scores, look for a potentially more-specific - # common ancestor using the combined scores and different thresholds. - # 0 represents the root taxon, so the combined aggretate score for 0 - # represents the sum of all combined scores of all leaves - sum_of_all_combined_scores = self.scores["combined_agg"][0] - self.fine_common_ancestor = self.calculate_common_ancestor( - self.scores["combined_agg"], sum_of_all_combined_scores, - self.fine_common_ancestor_threshold, self.fine_common_ancestor_rank_level_threshold) - - # given a set of scores, the sum of those scores (so we only need to calculate it once), - # a score threshold, a rank_level threshold, and optionall a taxon (if none is given it starts - # at the root of the taxonomy), resursively find the most specific node that is above - # the specified thresholds - def calculate_common_ancestor(self, ancestor_scores, sum_scores, score_threshold, - rank_level_threshold, taxon=None): - common_ancestor = taxon - taxon_id = 0 if taxon is None else taxon.id - # sort children from most- to least-likely - for child_id in sorted( - self.taxonomy.taxon_children[taxon_id], - key=lambda x: (ancestor_scores[x] if x in ancestor_scores else 0), - reverse=True): - # the child has no scores. This could be the result of pruning scores - # earlier on based on iconic_taxon. If there is no score, skip this branch - if child_id not in ancestor_scores: - break - # if the ratio of this score to the sum of all scores is below the - # score_threshold, then this taxon and its whole branch can be skipped - if (ancestor_scores[child_id] / sum_scores) < score_threshold: - break - child_taxon = self.taxonomy.taxa[child_id] - # if this taxon is below the rank_level_threshold, this branch can be skipped - if child_taxon.rank_level < rank_level_threshold: - continue - # this is a leaf, so return it - if child_id not in self.taxonomy.taxon_children: - return child_taxon - return self.calculate_common_ancestor(ancestor_scores, sum_scores, score_threshold, - rank_level_threshold, child_taxon) - return common_ancestor - - # takes a taxonID of the branch to score, and an indication if the branch is - # already known to be within the common ancestor branch - def aggregate_scores_recursive(self, taxon_id=0, in_common_ancestor=False): - vision_score = 0 - geo_score = 0 - combined_agg_score = 0 - # loop through all children of this iteration's taxon, or root taxon - for child_id in self.taxonomy.taxon_children[taxon_id]: - is_common_ancestor = False - # if there is a common ancestor, and this taxon is not yet known to be in it - if self.common_ancestor and not in_common_ancestor: - if child_id == self.common_ancestor.id: - # keep track that this taxon is the common ancestor, and resursive calls from - # this node down are also within the common ancestor - is_common_ancestor = True - elif child_id not in self.common_ancestor.ancestors: - # skip taxa that are not in the common ancestor branch - continue - # this taxon has children in the model - if child_id in self.taxonomy.taxon_children: - self.aggregate_scores_recursive(child_id, in_common_ancestor or is_common_ancestor) - else: - # this is a leaf taxon in the model - # record the vision and geo scores, using very low default scores for missing values - if child_id in self.vision_results: - child_vision_score = self.vision_results[child_id] - else: - child_vision_score = 0.00000001 - if len(self.geo_results) == 0: - child_geo_score = 1 - elif child_id in self.geo_results: - child_geo_score = self.geo_results[child_id] - else: - child_geo_score = 0.00000001 - self.scores["vision"][child_id] = child_vision_score - self.scores["vision_agg"][child_id] = child_vision_score - self.scores["geo"][child_id] = child_geo_score - # simple muliplication of vision and geo score to get a combined score - self.scores["combined"][child_id] = child_vision_score * child_geo_score - # also keeping track of scores aggregated up the tree. Since this is a leaf node, - # the aggregate branch score is equal to the combined score - self.scores["combined_agg"][child_id] = self.scores["combined"][child_id] - - child_vision_score = self.scores["vision_agg"][child_id] - child_geo_score = self.scores["geo"][child_id] - child_combined_agg_score = self.scores["combined_agg"][child_id] - - # vision scores can just be summed as they'll add up to 1 - vision_score += child_vision_score - # all maintain a sum of the combined scores in the branch. This will not add - # up to 1 and can be a wide range of values. Useful when compared to the sum - # of the combined scores for the entire tree - combined_agg_score += child_combined_agg_score - - # geo scores do not add up to 1, so have the geo score of a - # taxon be the max of the scores of its children - if child_geo_score > geo_score: - geo_score = child_geo_score - # scores have been calculated and summed for all this taxon's descendants, - # so reccord the final scores for this branch - self.scores["vision_agg"][taxon_id] = vision_score - self.scores["geo"][taxon_id] = geo_score - self.scores["combined_agg"][taxon_id] = combined_agg_score - - def recursive_results(self, taxon_id=0): - children = self.taxonomy.taxon_children[taxon_id] - # 0 represents the root taxon, so the combined aggretate score for 0 - # represents the sum of all combined scores of all leaves - sum_of_all_combined_scores = self.scores["combined_agg"][0] - # ignore children whose combined score ratio is less than 0.01 - scored_children = list(filter(lambda x: x in self.scores["combined_agg"] and ( - (self.scores["combined_agg"][x] / sum_of_all_combined_scores) >= 0.0001), children)) - if not scored_children: - return [taxon_id] - # sort children by score from most- to least-likely - scored_children = sorted(scored_children, key=lambda x: self.scores["combined_agg"][x], - reverse=True) - - results = [] - for child_id in scored_children: - # recursively repeat for descendants - if child_id in self.taxonomy.taxon_children: - child_results = self.recursive_results(child_id) - if child_results: - results = results + child_results - else: - results.append(child_id) - return results - - # prints to the console a tree prepresenting the most likely taxa and their - # aggregate combined score ratio. e.g. if all combined scores add up to 0.5 - # and a taxon has a combined score of 0.1, its combined score ratio will be 20%, or 0.2 - def print(self, taxon_id=0, ancestor_prefix=""): - children = self.taxonomy.taxon_children[taxon_id] - # 0 represents the root taxon, so the combined aggretate score for 0 - # represents the sum of all combined scores of all leaves - sum_of_all_commbined_scores = self.scores["combined_agg"][0] - # ignore children whose combined score ration is less than 0.01 - scored_children = list(filter(lambda x: x in self.scores["combined_agg"] and ( - (self.scores["combined_agg"][x] / sum_of_all_commbined_scores) >= 0.005), children)) - # sort children by score from most- to least-likely - scored_children = sorted(scored_children, key=lambda x: self.scores["combined_agg"][x], - reverse=True) - - index = 0 - for child_id in scored_children: - # some logic for visual tree indicators when printing - last_in_branch = (index == len(scored_children) - 1) - index += 1 - icon = "└──" if last_in_branch else "├──" - prefixIcon = " " if last_in_branch else "│ " - taxon = self.taxonomy.taxa[child_id] - # print the taxon with its combined score ratio - combined_score_ratio = self.scores["combined_agg"][child_id] / self.scores["combined_agg"][0] - print(f'{ancestor_prefix}{icon}{taxon.name} ({child_id}) :: {combined_score_ratio:.10f}') - # recursively repeat for descendants - if child_id in self.taxonomy.taxon_children: - self.print(child_id, f'{ancestor_prefix}{prefixIcon}') diff --git a/lib/res_layer.py b/lib/res_layer.py new file mode 100644 index 0000000..dc7ade3 --- /dev/null +++ b/lib/res_layer.py @@ -0,0 +1,28 @@ +import tensorflow as tf + + +class ResLayer(tf.keras.layers.Layer): + def __init__(self): + super(ResLayer, self).__init__() + self.w1 = tf.keras.layers.Dense( + 256, + activation="relu", + kernel_initializer="he_normal" + ) + self.w2 = tf.keras.layers.Dense( + 256, + activation="relu", + kernel_initializer="he_normal" + ) + self.dropout = tf.keras.layers.Dropout(rate=0.5) + self.add = tf.keras.layers.Add() + + def call(self, inputs): + x = self.w1(inputs) + x = self.dropout(x) + x = self.w2(x) + x = self.add([x, inputs]) + return x + + def get_config(self): + return {} diff --git a/lib/taxon.py b/lib/taxon.py index bc8b703..63e169a 100644 --- a/lib/taxon.py +++ b/lib/taxon.py @@ -14,7 +14,7 @@ class Taxon: def __init__(self, row): for key in row: - setattr(self, key, row[key]) + self.set(key, row[key]) def set(self, attr, val): setattr(self, attr, val) diff --git a/lib/tf_gp_elev_model.py b/lib/tf_gp_elev_model.py index ad9ebbd..ebaad95 100644 --- a/lib/tf_gp_elev_model.py +++ b/lib/tf_gp_elev_model.py @@ -2,37 +2,11 @@ import numpy as np import math import os +from lib.res_layer import ResLayer os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" -class ResLayer(tf.keras.layers.Layer): - def __init__(self): - super(ResLayer, self).__init__() - self.w1 = tf.keras.layers.Dense( - 256, - activation="relu", - kernel_initializer="he_normal" - ) - self.w2 = tf.keras.layers.Dense( - 256, - activation="relu", - kernel_initializer="he_normal" - ) - self.dropout = tf.keras.layers.Dropout(rate=0.5) - self.add = tf.keras.layers.Add() - - def call(self, inputs): - x = self.w1(inputs) - x = self.dropout(x) - x = self.w2(x) - x = self.add([x, inputs]) - return x - - def get_config(self): - return {} - - class TFGeoPriorModelElev: def __init__(self, model_path): @@ -48,26 +22,9 @@ def __init__(self, model_path): ) def predict(self, latitude, longitude, elevation): - norm_lat = latitude / 90.0 - norm_lng = longitude / 180.0 - norm_loc = tf.stack([norm_lng, norm_lat]) - - if elevation > 0: - norm_elev = elevation / 6574 - elif elevation == 0: - norm_elev = 0.0 - else: - norm_elev = elevation / 32768 - - norm_elev = tf.expand_dims(norm_elev, axis=0) - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi), - norm_elev - ], axis=0) - + encoded_loc = TFGeoPriorModelElev.encode_loc([latitude], [longitude], [elevation]) return self.gpmodel(tf.convert_to_tensor( - tf.expand_dims(encoded_loc, axis=0) + tf.expand_dims(encoded_loc[0], axis=0) ), training=False)[0] def features_for_one_class_elevation(self, latitude, longitude, elevation): @@ -82,36 +39,8 @@ def features_for_one_class_elevation(self, latitude, longitude, elevation): Returns: numpy array: scores for class of interest at each location """ - def encode_loc(latitude, longitude, elevation): - latitude = np.array(latitude) - longitude = np.array(longitude) - elevation = np.array(elevation) - elevation = elevation.astype("float32") - grid_lon = longitude.astype('float32') / 180.0 - grid_lat = latitude.astype('float32') / 90.0 - - elevation[elevation > 0] = elevation[elevation > 0] / 6574.0 - elevation[elevation < 0] = elevation[elevation < 0] / 32768.0 - norm_elev = elevation - - if np.isscalar(grid_lon): - grid_lon = np.array([grid_lon]) - if np.isscalar(grid_lat): - grid_lat = np.array([grid_lat]) - if np.isscalar(norm_elev): - norm_elev = np.array([norm_elev]) - - norm_loc = tf.stack([grid_lon, grid_lat], axis=1) - - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi), - tf.expand_dims(norm_elev, axis=1), - - ], axis=1) - return encoded_loc - - encoded_loc = encode_loc(latitude, longitude, elevation) + + encoded_loc = TFGeoPriorModelElev.encode_loc(latitude, longitude, elevation) loc_emb = self.gpmodel.layers[0](encoded_loc) # res layers - feature extraction @@ -131,3 +60,33 @@ def eval_one_class_elevation_from_features(self, features, class_of_interest): transpose_b=True ) ).numpy() + + @staticmethod + def encode_loc(latitude, longitude, elevation): + latitude = np.array(latitude) + longitude = np.array(longitude) + elevation = np.array(elevation) + elevation = elevation.astype("float32") + grid_lon = longitude.astype("float32") / 180.0 + grid_lat = latitude.astype("float32") / 90.0 + + elevation[elevation > 0] = elevation[elevation > 0] / 6574.0 + elevation[elevation < 0] = elevation[elevation < 0] / 32768.0 + norm_elev = elevation + + # if np.isscalar(grid_lon): + # grid_lon = np.array([grid_lon]) + # if np.isscalar(grid_lat): + # grid_lat = np.array([grid_lat]) + # if np.isscalar(norm_elev): + # norm_elev = np.array([norm_elev]) + + norm_loc = tf.stack([grid_lon, grid_lat], axis=1) + + encoded_loc = tf.concat([ + tf.sin(norm_loc * math.pi), + tf.cos(norm_loc * math.pi), + tf.expand_dims(norm_elev, axis=1), + + ], axis=1) + return encoded_loc diff --git a/lib/tf_gp_model.py b/lib/tf_gp_model.py deleted file mode 100644 index 1e12ade..0000000 --- a/lib/tf_gp_model.py +++ /dev/null @@ -1,96 +0,0 @@ -import tensorflow as tf -import numpy as np -import math -import os - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -class ResLayer(tf.keras.layers.Layer): - def __init__(self): - super(ResLayer, self).__init__() - self.w1 = tf.keras.layers.Dense( - 256, - activation="relu", - kernel_initializer="he_normal" - ) - self.w2 = tf.keras.layers.Dense( - 256, - activation="relu", - kernel_initializer="he_normal" - ) - self.dropout = tf.keras.layers.Dropout(rate=0.5) - self.add = tf.keras.layers.Add() - - def call(self, inputs): - x = self.w1(inputs) - x = self.dropout(x) - x = self.w2(x) - x = self.add([x, inputs]) - return x - - def get_config(self): - return {} - - -class TFGeoPriorModel: - - def __init__(self, model_path, taxonomy): - self.taxonomy = taxonomy - # initialize the geo model for inference - self.gpmodel = tf.keras.models.load_model( - model_path, - custom_objects={'ResLayer': ResLayer}, - compile=False - ) - - def predict(self, latitude, longitude): - norm_lat = np.array([float(latitude)]) / 90.0 - norm_lng = np.array([float(longitude)]) / 180.0 - norm_loc = tf.stack([norm_lng, norm_lat], axis=1) - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi) - ], axis=1) - return self.gpmodel.predict([encoded_loc], verbose=0)[0] - - def eval_one_class(self, latitude, longitude, class_of_interest): - """Evalutes the model for a single class and multiple locations - - Args: - latitude (list): A list of latitudes - longitude (list): A list of longitudes (same length as latitude) - class_of_interest (int): The single class to eval - - Returns: - numpy array: scores for class of interest at each location - """ - def encode_loc(latitude, longitude): - latitude = np.array(latitude) - longitude = np.array(longitude) - grid_lon = longitude.astype('float32') / 180.0 - grid_lat = latitude.astype('float32') / 90.0 - norm_loc = tf.stack([grid_lon, grid_lat], axis=1) - encoded_loc = tf.concat([ - tf.sin(norm_loc * math.pi), - tf.cos(norm_loc * math.pi) - ], axis=1) - return encoded_loc - - encoded_loc = encode_loc(latitude, longitude) - loc_emb = self.gpmodel.layers[0](encoded_loc) - - # res layers - feature extraction - x = self.gpmodel.layers[1](loc_emb) - x = self.gpmodel.layers[2](x) - x = self.gpmodel.layers[3](x) - x = self.gpmodel.layers[4](x) - - # process just the one class - return tf.keras.activations.sigmoid( - tf.matmul( - x, - tf.expand_dims(self.gpmodel.layers[5].weights[0][:,class_of_interest], axis=0), - transpose_b=True - ) - ).numpy() diff --git a/lib/vision_inferrer.py b/lib/vision_inferrer.py index 938c2e6..f1c7482 100644 --- a/lib/vision_inferrer.py +++ b/lib/vision_inferrer.py @@ -1,14 +1,10 @@ import tensorflow as tf -import os -import hashlib -import pickle class VisionInferrer: - def __init__(self, model_path, taxonomy): + def __init__(self, model_path): self.model_path = model_path - self.taxonomy = taxonomy self.prepare_tf_model() # initialize the TF model given the configured path @@ -21,32 +17,6 @@ def prepare_tf_model(self): self.vision_model = tf.keras.models.load_model(self.model_path, compile=False) - # given a unique key, generate a path where vision results can be cached - def cache_path_for_request(self, cache_key): - if cache_key: - cache_hash = hashlib.md5(cache_key.encode()).hexdigest() - return os.path.join("./lib", "vision_cache", cache_hash) - - # given a path, return vision results cached at that path - def cached_results(self, cache_path): - if cache_path and os.path.exists(cache_path): - with open(cache_path, "rb") as handle: - results = pickle.loads(handle.read()) - return results - - # given a path, cache vision results in a file at that path - def cache_results(self, cache_path, results): - if cache_path is not None: - with open(cache_path, "wb+") as cache_file: - pickle.dump(results, cache_file) - - # only return results for up to 500 taxa, or until the scores are very low, whichever - # comes first - # NOTE: This may not be helpful and needs testing for accuracy and processing time - def results_fully_populated(self, results, score): - number_of_results = len(results) - return (number_of_results >= 500 and score < 0.00000001) or number_of_results >= 5000 - # given an image object (usually coming from prepare_image_for_inference), # calculate vision results for the image def process_image(self, image): diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..0d519d7 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +testpaths = tests +filterwarnings = + ignore:.*ml_dtypes.*:DeprecationWarning diff --git a/requirements.txt b/requirements.txt index a0cd337..a4d3300 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +coverage flake8 Flask Flask-WTF @@ -11,6 +12,9 @@ numpy pandas Pillow prison +pytest +pytest-cov +pytest-mock python-magic pyyaml scikit_learn diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..7a5660a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,39 @@ +import pytest +import os +from unittest.mock import MagicMock +from lib.inat_inferrer import InatInferrer +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe + + +@pytest.fixture() +def taxonomy(): + yield ModelTaxonomyDataframe( + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), + os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv") + ) + + +@pytest.fixture() +def taxon(request, taxonomy): + results = taxonomy.df.query(f'name == "{request.param}"') + yield results.iloc[0] + + +@pytest.fixture() +def inatInferrer(request, mocker): + config = { + "vision_model_path": "vision_model_path", + "tf_geo_elevation_model_path": "tf_geo_elevation_model_path", + "taxonomy_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv"), + "elevation_h3_r4": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/elevation.csv"), + "tf_elev_thresholds": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/thresholds.csv"), + "taxon_ranges_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxon_ranges"), + "synonyms_path": + os.path.realpath(os.path.dirname(__file__) + "/fixtures/synonyms.csv") + } + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + return InatInferrer(config) diff --git a/tests/fixtures/elevation.csv b/tests/fixtures/elevation.csv new file mode 100644 index 0000000..d4a36f8 --- /dev/null +++ b/tests/fixtures/elevation.csv @@ -0,0 +1,2 @@ +h3_04,elevation +842a339ffffffff,10 \ No newline at end of file diff --git a/tests/fixtures/lamprocapnos_spectabilis.jpeg b/tests/fixtures/lamprocapnos_spectabilis.jpeg new file mode 100644 index 0000000..ecd5cce Binary files /dev/null and b/tests/fixtures/lamprocapnos_spectabilis.jpeg differ diff --git a/tests/fixtures/synonyms.csv b/tests/fixtures/synonyms.csv new file mode 100644 index 0000000..2b83ef1 --- /dev/null +++ b/tests/fixtures/synonyms.csv @@ -0,0 +1,2 @@ +taxon_id,synonym_id,synonym_rank_level,synonym_name +979668,979668,10.0,Corvus enca diff --git a/tests/fixtures/taxon_ranges/7.csv b/tests/fixtures/taxon_ranges/7.csv new file mode 100644 index 0000000..3353fc6 --- /dev/null +++ b/tests/fixtures/taxon_ranges/7.csv @@ -0,0 +1,8 @@ +84441a5ffffffff +8444a91ffffffff +8444a93ffffffff +8444a95ffffffff +8444a99ffffffff +8444a9bffffffff +8444a9dffffffff +8444ad3ffffffff diff --git a/tests/fixtures/taxonomy.csv b/tests/fixtures/taxonomy.csv new file mode 100644 index 0000000..38503c6 --- /dev/null +++ b/tests/fixtures/taxonomy.csv @@ -0,0 +1,21 @@ +parent_taxon_id,taxon_id,rank_level,leaf_class_id,iconic_class_id,spatial_class_id,name +,1,70,,,1,Animalia +1,2,60,,,2,Chordata +2,355675,57,,,3,Vertebrata +355675,3,50,,,4,Aves +3,4,40,,,5,Gruiformes +4,5,30,,,6,Aramidae +5,6,20,,,7,Aramus +6,7,10,1,1,8,Aramus guarauna +3,71262,40,,,9,Cariamiformes +71262,12,30,,,10,Cariamidae +12,13,20,,,11,Cariama +13,14,10,2,1,12,Cariama cristata +,47126,70,,,13,Plantae +47126,211194,60,,,14,Tracheophyta +211194,47125,57,,,15,Angiospermae +47125,47124,50,,,16,Magnoliopsida +47124,71289,40,,,17,Saxifragales +71289,47131,30,,,18,Grossulariaceae +47131,47130,20,,,19,Ribes +47130,47129,10,3,2,20,Ribes californicum diff --git a/tests/fixtures/thresholds.csv b/tests/fixtures/thresholds.csv new file mode 100644 index 0000000..ceaa228 --- /dev/null +++ b/tests/fixtures/thresholds.csv @@ -0,0 +1,3 @@ +,taxon_id,thres,area +0,7,0.1,1000000.1 +1,14,0.2,1000000.1 diff --git a/tests/test_inat_inferrer.py b/tests/test_inat_inferrer.py new file mode 100644 index 0000000..9ba1a1c --- /dev/null +++ b/tests/test_inat_inferrer.py @@ -0,0 +1,106 @@ +import tensorflow as tf +import pandas as pd +import os +import pytest +from unittest.mock import MagicMock +from lib.res_layer import ResLayer +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe + + +class TestInatInferrer: + def test_initialization(self, inatInferrer): + assert isinstance(inatInferrer.taxonomy, ModelTaxonomyDataframe) + assert isinstance(inatInferrer.synonyms, pd.DataFrame) + assert isinstance(inatInferrer.geo_elevation_cells, pd.DataFrame) + tf.keras.models.load_model.assert_any_call( + inatInferrer.config["vision_model_path"], + compile=False + ) + tf.keras.models.load_model.assert_any_call( + inatInferrer.config["tf_geo_elevation_model_path"], + custom_objects={'ResLayer': ResLayer}, + compile=False + ) + + def test_predictions_for_image(self, inatInferrer): + test_image_path = \ + os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") + scores = inatInferrer.predictions_for_image( + file_path=test_image_path, + lat=42, + lng=-71, + filter_taxon=None, + score_without_geo=False, + debug=True + ) + assert isinstance(scores, pd.DataFrame) + assert "leaf_class_id" in scores.columns + assert "parent_taxon_id" in scores.columns + assert "taxon_id" in scores.columns + assert "rank_level" in scores.columns + assert "iconic_class_id" in scores.columns + assert "vision_score" in scores.columns + assert "geo_score" in scores.columns + assert "normalized_vision_score" in scores.columns + assert "normalized_geo_score" in scores.columns + assert "combined_score" in scores.columns + assert "geo_threshold" in scores.columns + + def test_geo_model_predict_with_no_location(self, inatInferrer): + assert inatInferrer.geo_model_predict(lat=None, lng=None) is None + assert inatInferrer.geo_model_predict(lat="", lng="") is None + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_lookup_taxon(self, inatInferrer, taxon): + assert inatInferrer.lookup_taxon(taxon["taxon_id"])["name"] == taxon["name"] + + def test_lookup_taxon_with_no_taxon(self, inatInferrer): + assert inatInferrer.lookup_taxon(None) is None + + def test_lookup_taxon_with_invalid_taxon(self, inatInferrer): + with pytest.raises(KeyError): + assert inatInferrer.lookup_taxon(999999999) is None + + def test_aggregate_results(self, inatInferrer): + test_image_path = \ + os.path.realpath(os.path.dirname(__file__) + "/fixtures/lamprocapnos_spectabilis.jpeg") + scores = inatInferrer.predictions_for_image( + file_path=test_image_path, + lat=42, + lng=-71, + filter_taxon=None, + score_without_geo=False, + debug=True + ) + scores.normalized_vision_score = 0.5 + scores.normalized_geo_score = 0.5 + scores.combined_score = 0.25 + scores.geo_threshold = 0.001 + aggregated_scores = inatInferrer.aggregate_results( + leaf_scores=scores, + filter_taxon=None, + score_without_geo=False, + debug=True + ) + assert "aggregated_vision_score" in aggregated_scores.columns + assert "aggregated_geo_score" in aggregated_scores.columns + assert "aggregated_geo_threshold" in aggregated_scores.columns + assert "aggregated_combined_score" in aggregated_scores.columns + assert "normalized_aggregated_combined_score" in aggregated_scores.columns + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_h3_04_taxon_range_comparison(self, mocker, inatInferrer, taxon): + inatInferrer.h3_04_geo_results_for_taxon = MagicMock(return_value={ + "aa": "0.1", + "ab": "0.1" + }) + inatInferrer.h3_04_taxon_range = MagicMock(return_value={ + "ab": "0.1", + "bb": "0.1" + }) + range_comparison_results = inatInferrer.h3_04_taxon_range_comparison(taxon["taxon_id"]) + assert range_comparison_results == { + "aa": 0, + "ab": 0.5, + "bb": 1 + } diff --git a/tests/test_model_taxonomy.py b/tests/test_model_taxonomy.py new file mode 100644 index 0000000..167fff7 --- /dev/null +++ b/tests/test_model_taxonomy.py @@ -0,0 +1,54 @@ +import pytest +import os +from lib.model_taxonomy import ModelTaxonomy + + +@pytest.fixture() +def taxonomy(): + yield ModelTaxonomy( + os.path.realpath(os.path.dirname(__file__) + "/fixtures/taxonomy.csv") + ) + + +@pytest.fixture() +def taxon(request, taxonomy): + yield next(v for k, v in taxonomy.taxa.items() if v.name == request.param) + + +class TestModelTaxonomyDataframe: + def test_raise_error_on_missing_path(self): + with pytest.raises(FileNotFoundError): + ModelTaxonomy( + os.path.realpath("nonsense") + ) + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_loading_mapping(self, taxon): + assert taxon.id == 7 + assert taxon.parent_id == 6 + assert taxon.rank_level == 10 + assert taxon.leaf_class_id == 1 + assert taxon.name == "Aramus guarauna" + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_nested_set_assigning(self, taxon): + assert taxon.left == 7 + assert taxon.right == 8 + + def test_children_of_root(self, taxonomy): + children = taxonomy.taxon_children[0] + assert len(children) == 2 + assert taxonomy.taxa[children[0]].name == "Animalia" + assert taxonomy.taxa[children[1]].name == "Plantae" + + @pytest.mark.parametrize("taxon", ["Animalia"], indirect=True) + def test_children_of_taxon(self, taxonomy, taxon): + children = taxonomy.taxon_children[taxon.id] + assert len(children) == 1 + assert taxonomy.taxa[children[0]].name == "Chordata" + + def test_print(self, capsys, taxonomy): + taxonomy.print() + captured = capsys.readouterr() + assert "├──Animalia :: 0:23" in captured.out + assert "│ └──Chordata :: 1:22" in captured.out diff --git a/tests/test_model_taxonomy_dataframe.py b/tests/test_model_taxonomy_dataframe.py new file mode 100644 index 0000000..15d8d37 --- /dev/null +++ b/tests/test_model_taxonomy_dataframe.py @@ -0,0 +1,56 @@ +import pytest +from lib.model_taxonomy_dataframe import ModelTaxonomyDataframe + + +class TestModelTaxonomyDataframe: + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_loading_mapping(self, taxon): + assert taxon["taxon_id"] == 7 + assert taxon["parent_taxon_id"] == 6 + assert taxon["rank_level"] == 10 + assert taxon["leaf_class_id"] == 1 + assert taxon["iconic_class_id"] == 1 + assert taxon["spatial_class_id"] == 8 + assert taxon["name"] == "Aramus guarauna" + assert taxon["geo_threshold"] == 0.1 + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_nested_set_assigning(self, taxon): + assert taxon["left"] == 7 + assert taxon["right"] == 8 + + @pytest.mark.parametrize("taxon", ["Aramus guarauna"], indirect=True) + def test_geo_threshold_assigning(self, taxon): + assert taxon["geo_threshold"] == 0.1 + + def test_children_of_root(self, taxonomy): + children = ModelTaxonomyDataframe.children(taxonomy.df, 0) + assert len(children.index) == 2 + assert children.iloc[0]["name"] == "Animalia" + assert children.iloc[1]["name"] == "Plantae" + + @pytest.mark.parametrize("taxon", ["Animalia"], indirect=True) + def test_children_of_taxon(self, taxonomy, taxon): + children = ModelTaxonomyDataframe.children(taxonomy.df, taxon["taxon_id"]) + assert len(children.index) == 1 + assert children.iloc[0]["name"] == "Chordata" + + def test_print(self, capsys, taxonomy): + ModelTaxonomyDataframe.print(taxonomy.df) + captured = capsys.readouterr() + assert "├──Animalia :: 0:23" in captured.out + assert "│ └──Chordata :: 1:22" in captured.out + + def test_print_with_aggregated_combined_score(self, capsys, taxonomy): + taxonomy.df["aggregated_combined_score"] = 1 + ModelTaxonomyDataframe.print(taxonomy.df) + captured = capsys.readouterr() + assert "├──Animalia :: 0:23" in captured.out + assert "│ └──Chordata :: 1:22" in captured.out + + def test_print_with_lambda(self, capsys, taxonomy): + ModelTaxonomyDataframe.print(taxonomy.df, display_taxon_lambda=( + lambda row: "customformat" + )) + captured = capsys.readouterr() + assert "customformat" in captured.out diff --git a/tests/test_res_layer.py b/tests/test_res_layer.py new file mode 100644 index 0000000..f803472 --- /dev/null +++ b/tests/test_res_layer.py @@ -0,0 +1,31 @@ +import tensorflow as tf +import unittest.mock as mock +from lib.res_layer import ResLayer +from unittest.mock import MagicMock + + +class TestResLayer: + def test_initialization(self): + res_layer = ResLayer() + assert isinstance(res_layer.w1, tf.keras.layers.Dense) + assert isinstance(res_layer.w2, tf.keras.layers.Dense) + assert isinstance(res_layer.dropout, tf.keras.layers.Dropout) + assert isinstance(res_layer.add, tf.keras.layers.Add) + + def test_call(self, mocker): + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + res_layer = ResLayer() + inputs = tf.keras.Input((256,)) + res_layer.call(inputs) + call_w1 = mock.create_autospec(res_layer.w1.call) + call_dropout = mock.create_autospec(res_layer.dropout.call) + call_w2 = mock.create_autospec(res_layer.w1.call) + call_add = mock.create_autospec(res_layer.add.call) + call_w1.assert_called_once + call_dropout.assert_called_once + call_w2.assert_called_once + call_add.assert_called_once + + def test_get_config(self): + res_layer = ResLayer() + assert res_layer.get_config() == {} diff --git a/tests/test_taxon.py b/tests/test_taxon.py new file mode 100644 index 0000000..171cd48 --- /dev/null +++ b/tests/test_taxon.py @@ -0,0 +1,17 @@ +from lib.taxon import Taxon + + +class TestTaxon: + def test_initialization(self): + taxon = Taxon({"id": 0, "name": "Life"}) + assert taxon.name == "Life" + + def test_is_or_descendant_of_self(self): + taxon = Taxon({"id": 1}) + assert taxon.is_or_descendant_of(taxon) + + def test_is_or_descendant_of_taxon(self): + parent_taxon = Taxon({"id": 1, "left": 0, "right": 3}) + child_taxon = Taxon({"id": 2, "left": 1, "right": 2}) + assert child_taxon.is_or_descendant_of(parent_taxon) + assert not parent_taxon.is_or_descendant_of(child_taxon) diff --git a/tests/test_tf_gp_elev_model.py b/tests/test_tf_gp_elev_model.py new file mode 100644 index 0000000..65f0dae --- /dev/null +++ b/tests/test_tf_gp_elev_model.py @@ -0,0 +1,43 @@ +import pytest +import tensorflow as tf +from lib.res_layer import ResLayer +from lib.tf_gp_elev_model import TFGeoPriorModelElev +from unittest.mock import MagicMock + + +class TestTfGpModel: + def test_initialization_with_unknown_model_path(self): + with pytest.raises(OSError): + TFGeoPriorModelElev("model_path") + + def test_initialization(self, mocker): + model_path = "model_path" + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + TFGeoPriorModelElev(model_path) + tf.keras.models.load_model.assert_called_once_with( + model_path, + custom_objects={'ResLayer': ResLayer}, + compile=False + ) + + def test_predict(self, mocker): + model_path = "model_path" + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + tf_gp_model = TFGeoPriorModelElev(model_path) + tf_gp_model.predict(0, 0, 0) + + def test_features_for_one_class_elevation(self, mocker): + model_path = "model_path" + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + tf_gp_model = TFGeoPriorModelElev(model_path) + tf_gp_model.features_for_one_class_elevation([0], [0], [0]) + + def test_eval_one_class_elevation_from_features(self, mocker): + model_path = "model_path" + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + mocker.patch("tensorflow.keras.activations.sigmoid", return_value=MagicMock()) + mocker.patch("tensorflow.matmul", return_value=MagicMock()) + mocker.patch("tensorflow.expand_dims", return_value=MagicMock()) + tf_gp_model = TFGeoPriorModelElev(model_path) + tf_gp_model.eval_one_class_elevation_from_features("features", "class_of_interest") + tf.keras.activations.sigmoid.assert_called_once diff --git a/tests/test_vision_inferrer.py b/tests/test_vision_inferrer.py new file mode 100644 index 0000000..18830bf --- /dev/null +++ b/tests/test_vision_inferrer.py @@ -0,0 +1,26 @@ +import tensorflow as tf +from unittest.mock import MagicMock +from lib.vision_inferrer import VisionInferrer + + +class TestVisionInferrer: + def test_initialization(self, mocker): + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + model_path = "model_path" + inferrer = VisionInferrer(model_path) + assert inferrer.model_path == model_path + tf.keras.models.load_model.assert_called_once_with( + model_path, + compile=False + ) + + def test_process_image(self, mocker): + mocker.patch("tensorflow.keras.models.load_model", return_value=MagicMock()) + model_path = "model_path" + inferrer = VisionInferrer(model_path) + theimage = "theimage" + inferrer.process_image(theimage) + inferrer.vision_model.assert_called_once_with( + tf.convert_to_tensor(theimage), + training=False + )