Merge pull request #5 from NERC-CEH/more_data

Proof of concept of similarity search with the scivision model
NERC-CEH · Jul 22, 2024 · 0c34d98 · 0c34d98
2 parents f80abf4 + b2d2aa1
commit 0c34d98
Show file tree

Hide file tree

Showing 58 changed files with 1,567 additions and 2,559 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length=120
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -17,5 +17,5 @@ jobs:
         uses: py-actions/flake8@v2
         with:
             max-line-length: "120"
-            path: "cyto_ml"
+            path: cyto_ml
             plugins: "flake8-bugbear==22.1.11 flake8-black"
diff --git a/.github/workflows/pytest_coverage.yml b/.github/workflows/pytest_coverage.yml
@@ -24,7 +24,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
           auto-activate-base: false
       - run: pip install pytest-cov
-      - run: python -m pytest --cov=cyto_ml --cov-report xml:coverage.xml tests/
+      - run: python -m pytest --cov=cyto_ml --cov-report xml:coverage.xml
       - uses: actions/upload-artifact@v4
         with:
           name: coverage.xml

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .env
 **/.ipynb_checkpoints/
 **/__pycache__/
+vectors/
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ conda create -n cyto_39 python=3.9
 conda env update
 ```
 
-Please note that this is specifically pinned to python 3.9 due to dependency versions; we make experimental use of the [https://sci.vision/#/model/resnet50-plankton](CEFAS plankton model available through SciVision), which in turn uses an older version of pytorch that isn't packaged above python 3.9.
+Please note that this is specifically pinned to Python 3.9 due to dependency versions; we make experimental use of the [CEFAS plankton model available through SciVision](https://sci.vision/#/model/resnet50-plankton), which in turn uses an older version of PyTorch that isn't packaged for Python versions above 3.9.
 
 ### Object store connection
 
@@ -40,7 +40,7 @@ Get started by cloning this repository and running
 
 ### Feature extraction
 
-Experiment testing workflows by using [https://sci.vision/#/model/resnet50-plankton](this plankton model from SciVision) to extract features from images for use in similarity search, clustering, etc. 
+Experiment with testing workflows by using [this plankton model from SciVision](https://sci.vision/#/model/resnet50-plankton) to extract features from images for use in similarity search, clustering, etc.
 
 ### TBC (object store upload, derived classifiers, etc)
 

diff --git a/cyto_ml/data/intake.py b/cyto_ml/data/intake.py
@@ -0,0 +1,31 @@
+"""Utilities for expressing our dataset as an intake catalog"""
+
+
+def intake_yaml(
+    test_url: str,
+    catalog_url: str,
+):
+    """
+    Return the text of a minimal intake catalog (YAML) describing two sources:
+    `test_image`, a single image read from `test_url`, and
+    `plankton`, a CSV index of the images read from `catalog_url`.
+    Both URLs are interpolated into the template exactly as given.
+    Example: plankton dataset made available through scivision, metadata
+    https://raw.githubusercontent.com/alan-turing-institute/plankton-cefas-scivision/test_data_catalog/scivision.yml
+    """
+    # coerce_shape: [256, 256]
+    # NOTE(review): the line above is not part of the emitted template; presumably
+    # a candidate argument for the image source - confirm before enabling it.
+    return f"""
+sources:
+  test_image:
+    description: Single test image from the plankton collection
+    origin:
+    driver: intake_xarray.image.ImageSource
+    args:
+      urlpath: ["{test_url}"]
+      exif_tags: False
+  plankton:
+    description: A CSV index of all the images of plankton
+    origin:
+    driver: intake.source.csv.CSVSource
+    args:
+      urlpath: ["{catalog_url}"]
+"""
diff --git a/cyto_ml/data/s3.py b/cyto_ml/data/s3.py
@@ -0,0 +1,20 @@
+"""Thin wrapper around the s3 object store with images and metadata"""
+
+import s3fs
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
+
+
+def s3_endpoint():
+    """Build an s3fs filesystem handle for the object store.
+
+    The key and secret are read from FSSPEC_S3_KEY and FSSPEC_S3_SECRET
+    (an empty string is used when either is unset). ENDPOINT is mandatory:
+    a KeyError is raised if it is missing from the environment.
+    """
+    access_key = os.environ.get("FSSPEC_S3_KEY", "")
+    access_secret = os.environ.get("FSSPEC_S3_SECRET", "")
+    endpoint_options = {"endpoint_url": os.environ["ENDPOINT"]}
+    return s3fs.S3FileSystem(
+        anon=False,
+        key=access_key,
+        secret=access_secret,
+        client_kwargs=endpoint_options,
+    )
diff --git a/cyto_ml/data/vectorstore.py b/cyto_ml/data/vectorstore.py
@@ -1,11 +1,22 @@
-import chromadb
-from chromadb.db.base import UniqueConstraintError
+import os
 from typing import Optional
 import logging
 
+import chromadb
+from chromadb.db.base import UniqueConstraintError
+from chromadb.config import Settings
+
+
 logging.basicConfig(level=logging.INFO)
+# TODO make this sensibly configurable, not confusingly hardcoded
+# The path is anchored on this module's own directory (two levels up, then `vectors`),
+# so it does not depend on the current working directory.
+STORE = os.path.join(os.path.abspath(os.path.dirname(__file__)), "../../vectors")
 
-client = chromadb.PersistentClient(path="./vectors")
+# Module-level client, created at import time and persisting under STORE;
+# chromadb's anonymized telemetry is explicitly switched off.
+client = chromadb.PersistentClient(
+    path=STORE,
+    settings=Settings(
+        anonymized_telemetry=False,
+    ),
+)
 
 
 def vector_store(name: Optional[str] = "test_collection"):

diff --git a/cyto_ml/models/scivision.py b/cyto_ml/models/scivision.py
@@ -49,3 +49,9 @@ def prepare_image(image: DataArray):
         tensor_image = tensor_image.cuda()
 
     return tensor_image
+
+
+def flat_embeddings(features: torch.Tensor):
+    """Utility function that takes the features returned by the model in truncate_model
+    and flattens the first item of the batch into a plain list of Python floats,
+    suitable for storing in a vector database.
+
+    The tensor is detached and copied to the CPU before conversion, so this also
+    works when the features were computed on a GPU (calling `.numpy()` on a CUDA
+    tensor raises). Any trailing singleton dimensions are flattened away.
+    """
+    return features[0].detach().cpu().flatten().double().tolist()
diff --git a/tests/conftest.py → cyto_ml/tests/conftest.py b/tests/conftest.py → cyto_ml/tests/conftest.py
@@ -1,6 +1,10 @@
 import os
 import pytest
-
+from cyto_ml.models.scivision import (
+    load_model,
+    truncate_model,
+    SCIVISION_URL,
+)
 
 
 @pytest.fixture
@@ -22,3 +26,8 @@ def single_image(image_dir):
 @pytest.fixture
 def image_batch(image_dir):
     return os.path.join(image_dir, "testymctestface_*.tif")
+
+
+@pytest.fixture
+def scivision_model():
+    """Provide the model loaded from SCIVISION_URL after passing it through truncate_model."""
+    full_model = load_model(SCIVISION_URL)
+    return truncate_model(full_model)
diff --git a/...ures/test_images/testymctestface_1091.tif → ...ures/test_images/testymctestface_1091.tif b/...ures/test_images/testymctestface_1091.tif → ...ures/test_images/testymctestface_1091.tif
diff --git a/...tures/test_images/testymctestface_113.tif → ...tures/test_images/testymctestface_113.tif b/...tures/test_images/testymctestface_113.tif → ...tures/test_images/testymctestface_113.tif
diff --git a/...tures/test_images/testymctestface_127.tif → ...tures/test_images/testymctestface_127.tif b/...tures/test_images/testymctestface_127.tif → ...tures/test_images/testymctestface_127.tif
diff --git a/...tures/test_images/testymctestface_133.tif → ...tures/test_images/testymctestface_133.tif b/...tures/test_images/testymctestface_133.tif → ...tures/test_images/testymctestface_133.tif
diff --git a/...ures/test_images/testymctestface_1388.tif → ...ures/test_images/testymctestface_1388.tif b/...ures/test_images/testymctestface_1388.tif → ...ures/test_images/testymctestface_1388.tif
diff --git a/...ures/test_images/testymctestface_1407.tif → ...ures/test_images/testymctestface_1407.tif b/...ures/test_images/testymctestface_1407.tif → ...ures/test_images/testymctestface_1407.tif
diff --git a/...ures/test_images/testymctestface_1830.tif → ...ures/test_images/testymctestface_1830.tif b/...ures/test_images/testymctestface_1830.tif → ...ures/test_images/testymctestface_1830.tif
diff --git a/...ures/test_images/testymctestface_1876.tif → ...ures/test_images/testymctestface_1876.tif b/...ures/test_images/testymctestface_1876.tif → ...ures/test_images/testymctestface_1876.tif
diff --git a/...tures/test_images/testymctestface_188.tif → ...tures/test_images/testymctestface_188.tif b/...tures/test_images/testymctestface_188.tif → ...tures/test_images/testymctestface_188.tif
diff --git a/...ures/test_images/testymctestface_1887.tif → ...ures/test_images/testymctestface_1887.tif b/...ures/test_images/testymctestface_1887.tif → ...ures/test_images/testymctestface_1887.tif
diff --git a/...ures/test_images/testymctestface_1890.tif → ...ures/test_images/testymctestface_1890.tif b/...ures/test_images/testymctestface_1890.tif → ...ures/test_images/testymctestface_1890.tif
diff --git a/...ures/test_images/testymctestface_1892.tif → ...ures/test_images/testymctestface_1892.tif b/...ures/test_images/testymctestface_1892.tif → ...ures/test_images/testymctestface_1892.tif
diff --git a/...ures/test_images/testymctestface_1901.tif → ...ures/test_images/testymctestface_1901.tif b/...ures/test_images/testymctestface_1901.tif → ...ures/test_images/testymctestface_1901.tif
diff --git a/...ures/test_images/testymctestface_1909.tif → ...ures/test_images/testymctestface_1909.tif b/...ures/test_images/testymctestface_1909.tif → ...ures/test_images/testymctestface_1909.tif
diff --git a/...ures/test_images/testymctestface_1912.tif → ...ures/test_images/testymctestface_1912.tif b/...ures/test_images/testymctestface_1912.tif → ...ures/test_images/testymctestface_1912.tif
diff --git a/...ures/test_images/testymctestface_1914.tif → ...ures/test_images/testymctestface_1914.tif b/...ures/test_images/testymctestface_1914.tif → ...ures/test_images/testymctestface_1914.tif
diff --git a/...ures/test_images/testymctestface_1915.tif → ...ures/test_images/testymctestface_1915.tif b/...ures/test_images/testymctestface_1915.tif → ...ures/test_images/testymctestface_1915.tif
diff --git a/...ures/test_images/testymctestface_1919.tif → ...ures/test_images/testymctestface_1919.tif b/...ures/test_images/testymctestface_1919.tif → ...ures/test_images/testymctestface_1919.tif
diff --git a/...ures/test_images/testymctestface_1922.tif → ...ures/test_images/testymctestface_1922.tif b/...ures/test_images/testymctestface_1922.tif → ...ures/test_images/testymctestface_1922.tif
diff --git a/...ures/test_images/testymctestface_1924.tif → ...ures/test_images/testymctestface_1924.tif b/...ures/test_images/testymctestface_1924.tif → ...ures/test_images/testymctestface_1924.tif
diff --git a/...ures/test_images/testymctestface_1948.tif → ...ures/test_images/testymctestface_1948.tif b/...ures/test_images/testymctestface_1948.tif → ...ures/test_images/testymctestface_1948.tif
diff --git a/...ures/test_images/testymctestface_1953.tif → ...ures/test_images/testymctestface_1953.tif b/...ures/test_images/testymctestface_1953.tif → ...ures/test_images/testymctestface_1953.tif
diff --git a/...ures/test_images/testymctestface_1962.tif → ...ures/test_images/testymctestface_1962.tif b/...ures/test_images/testymctestface_1962.tif → ...ures/test_images/testymctestface_1962.tif
diff --git a/...ures/test_images/testymctestface_1965.tif → ...ures/test_images/testymctestface_1965.tif b/...ures/test_images/testymctestface_1965.tif → ...ures/test_images/testymctestface_1965.tif
diff --git a/...ures/test_images/testymctestface_1981.tif → ...ures/test_images/testymctestface_1981.tif b/...ures/test_images/testymctestface_1981.tif → ...ures/test_images/testymctestface_1981.tif
diff --git a/...ures/test_images/testymctestface_2012.tif → ...ures/test_images/testymctestface_2012.tif b/...ures/test_images/testymctestface_2012.tif → ...ures/test_images/testymctestface_2012.tif
diff --git a/...ures/test_images/testymctestface_2071.tif → ...ures/test_images/testymctestface_2071.tif b/...ures/test_images/testymctestface_2071.tif → ...ures/test_images/testymctestface_2071.tif
diff --git a/...ures/test_images/testymctestface_2102.tif → ...ures/test_images/testymctestface_2102.tif b/...ures/test_images/testymctestface_2102.tif → ...ures/test_images/testymctestface_2102.tif
diff --git a/...ures/test_images/testymctestface_2108.tif → ...ures/test_images/testymctestface_2108.tif b/...ures/test_images/testymctestface_2108.tif → ...ures/test_images/testymctestface_2108.tif
diff --git a/...ures/test_images/testymctestface_2110.tif → ...ures/test_images/testymctestface_2110.tif b/...ures/test_images/testymctestface_2110.tif → ...ures/test_images/testymctestface_2110.tif
diff --git a/...ures/test_images/testymctestface_2115.tif → ...ures/test_images/testymctestface_2115.tif b/...ures/test_images/testymctestface_2115.tif → ...ures/test_images/testymctestface_2115.tif
diff --git a/...ures/test_images/testymctestface_2117.tif → ...ures/test_images/testymctestface_2117.tif b/...ures/test_images/testymctestface_2117.tif → ...ures/test_images/testymctestface_2117.tif
diff --git a/...ures/test_images/testymctestface_2119.tif → ...ures/test_images/testymctestface_2119.tif b/...ures/test_images/testymctestface_2119.tif → ...ures/test_images/testymctestface_2119.tif
diff --git a/...ures/test_images/testymctestface_2172.tif → ...ures/test_images/testymctestface_2172.tif b/...ures/test_images/testymctestface_2172.tif → ...ures/test_images/testymctestface_2172.tif
diff --git a/...ures/test_images/testymctestface_2715.tif → ...ures/test_images/testymctestface_2715.tif b/...ures/test_images/testymctestface_2715.tif → ...ures/test_images/testymctestface_2715.tif
diff --git a/...xtures/test_images/testymctestface_36.tif → ...xtures/test_images/testymctestface_36.tif b/...xtures/test_images/testymctestface_36.tif → ...xtures/test_images/testymctestface_36.tif
diff --git a/...ures/test_images/testymctestface_3612.tif → ...ures/test_images/testymctestface_3612.tif b/...ures/test_images/testymctestface_3612.tif → ...ures/test_images/testymctestface_3612.tif
diff --git a/...ures/test_images/testymctestface_3814.tif → ...ures/test_images/testymctestface_3814.tif b/...ures/test_images/testymctestface_3814.tif → ...ures/test_images/testymctestface_3814.tif
diff --git a/...ures/test_images/testymctestface_4715.tif → ...ures/test_images/testymctestface_4715.tif b/...ures/test_images/testymctestface_4715.tif → ...ures/test_images/testymctestface_4715.tif
diff --git a/...ures/test_images/testymctestface_4961.tif → ...ures/test_images/testymctestface_4961.tif b/...ures/test_images/testymctestface_4961.tif → ...ures/test_images/testymctestface_4961.tif
diff --git a/cyto_ml/tests/test_image_embeddings.py b/cyto_ml/tests/test_image_embeddings.py
@@ -0,0 +1,13 @@
+from intake_xarray import ImageSource
+from torch import Tensor
+from cyto_ml.models.scivision import prepare_image, flat_embeddings
+
+
+def test_embeddings(scivision_model, single_image):
+    """The flattened embedding has as many values as the second dimension of the model output."""
+    image = ImageSource(single_image).to_dask()
+    features = scivision_model(prepare_image(image))
+    assert isinstance(features, Tensor)
+
+    flattened = flat_embeddings(features)
+    assert features.size()[1] == len(flattened)
diff --git a/tests/test_prepare_image.py → cyto_ml/tests/test_prepare_image.py b/tests/test_prepare_image.py → cyto_ml/tests/test_prepare_image.py
@@ -1,6 +1,7 @@
 # test_prepare_image.py
 import pytest
 import torch
+import logging
 from intake_xarray import ImageSource
 from cyto_ml.models.scivision import prepare_image
 
@@ -10,7 +11,7 @@
 def test_single_image(single_image):
 
     image_data = ImageSource(single_image).to_dask()
-    # Prepare the image
+    # Tensorise the image (potentially normalise if we have useful values)
     prepared_image = prepare_image(image_data)
 
     # Check if the shape is correct (batch dimension added)
@@ -19,15 +20,17 @@ def test_single_image(single_image):
 
 def test_image_batch(image_batch):
     """
-    Currently expected to fail because dask wants images to share dimensions
+    Currently expected to fail because dask wants images to share dimensions, ours don't
+    Needs digging into the (source) data from the FlowCam that gets decollaged
+    We either pad them (and process a lot of blank space) or stick to single image input
     """
     # Load a batch of plankton images
 
     image_data = ImageSource(image_batch).to_dask()
 
     with pytest.raises(ValueError) as err:
-        prepared_batch = prepare_image(image_data)
-        print(err)
+        prepare_image(image_data)
+    # Log outside the block: nothing after the raising call inside `with` is ever executed
+    logging.info(err.value)
     # Check if the shape is correct
     # assert prepared_batch.shape == torch.Size([64, 89, 36, 3])
 

diff --git a/cyto_ml/tests/test_vector_store.py b/cyto_ml/tests/test_vector_store.py
@@ -0,0 +1,21 @@
+from cyto_ml.data.vectorstore import vector_store, client
+import numpy as np
+
+
+def test_client_no_telemetry():
+    """The shared chromadb client must report anonymized telemetry as disabled."""
+    settings = client.get_settings()
+    assert not settings["anonymized_telemetry"]
+
+
+def test_store():
+    """Add one record with a random 2048-value embedding and read that embedding back by id."""
+    store = vector_store()  # default 'test_collection'
+    record_id = "id_1"  # insists on a str; named to avoid shadowing the builtin `id`
+    filename = "https://example.com/filename.tif"
+    store.add(
+        documents=[filename],  # we use image location in s3 rather than text content
+        embeddings=[np.random.rand(2048).tolist()],  # wants a list of lists
+        ids=[record_id],  # wants a list of ids
+    )
+
+    # Look the record up with the same variable used to store it
+    record = store.get(record_id, include=["embeddings"])
+    assert record
+    assert len(record["embeddings"][0]) == 2048
diff --git a/environment.yml b/environment.yml
@@ -10,10 +10,12 @@ dependencies:
   - dask
   - pip:
     - pytest
+    - imagecodecs
     - intake # for reading scivision
     - torch==1.10.0 # install before cefas_scivision; it needs this version
     - scivision
     - scikit-image
     - setuptools==69.5.1 # because this bug https://github.com/pytorch/serve/issues/3176
+    - tiffile
     - git+https://github.com/alan-turing-institute/plankton-cefas-scivision@main # torch version
     - chromadb