Merge pull request #10 from mrjleo/api
Organize API
mrjleo authored Dec 13, 2024
2 parents 43db891 + a3104ae commit 9a4f27c
Showing 30 changed files with 914 additions and 853 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -19,7 +19,9 @@ Using a Fast-Forward index is as simple as providing a TREC run with retrieval s

```python
from pathlib import Path
from fast_forward import OnDiskIndex, Mode, Ranking

from fast_forward import Ranking
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.encoder import TCTColBERTQueryEncoder

# choose a pre-trained query encoder
16 changes: 6 additions & 10 deletions src/fast_forward/__init__.py
@@ -1,15 +1,11 @@
""".. include:: docs/main.md""" # noqa: D400, D415
""".. include:: docs/main.md
.. include:: docs/ranking.md
""" # noqa: D205, D400, D415

import importlib.metadata

# in this specific case, the redundant aliases are recommended by pyright
# ruff: noqa: PLC0414
from fast_forward.index import Mode as Mode
from fast_forward.index.disk import OnDiskIndex as OnDiskIndex
from fast_forward.index.memory import InMemoryIndex as InMemoryIndex
from fast_forward.indexer import Indexer as Indexer
from fast_forward.quantizer.nanopq import NanoOPQ as NanoOPQ
from fast_forward.quantizer.nanopq import NanoPQ as NanoPQ
from fast_forward.ranking import Ranking as Ranking
from fast_forward import encoder, index, quantizer, util
from fast_forward.ranking import Ranking

__all__ = ["encoder", "index", "quantizer", "util", "Ranking"]
__version__ = importlib.metadata.version("fast-forward-indexes")
4 changes: 2 additions & 2 deletions src/fast_forward/docs/encoder.md
@@ -1,8 +1,8 @@
# Custom encoders

Custom encoders can easily be implemented. The preferred way to do this is by subclassing `fast_forward.encoder.Encoder` and overriding the `fast_forward.encoder.Encoder._encode` method. This allows the new encoder to make use of batch encoding.
Custom encoders can easily be implemented. The preferred way to do this is by subclassing `fast_forward.encoder.base.Encoder` and overriding the `_encode` method. This allows the new encoder to make use of batch encoding.
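
For illustration, a minimal sketch of such a subclass (the `_encode` signature follows the `Encoder` base class added in this commit; the encoder itself is a toy that returns zero vectors):

```python
import numpy as np

from fast_forward.encoder import Encoder


class ZeroEncoder(Encoder):
    """Toy encoder that maps every text to a zero vector (illustration only)."""

    def _encode(self, texts):
        # batch encoding: one 768-dimensional vector per input text
        return np.zeros((len(texts), 768), dtype=np.float32)
```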

Alternatively, one can use the `fast_forward.encoder.LambdaEncoder` class, which wraps a function that encodes a single piece of text. The following example shows how to do this with a [Pyserini](https://github.com/castorini/pyserini) query encoder:
Alternatively, one can use the `LambdaEncoder` class, which wraps a function that encodes a single piece of text. The following example shows how to do this with a [Pyserini](https://github.com/castorini/pyserini) query encoder:

```python
pyserini_encoder = pyserini.encode.AnceQueryEncoder("castorini/ance-msmarco-passage")
34 changes: 18 additions & 16 deletions src/fast_forward/docs/index.md
@@ -1,17 +1,19 @@
# Indexes

Indexes are the core of the Fast-Forward library. In the following, you'll find some snippets that show how to create and use indexes.

# Index types
## Index types

Currently, two types of indexes are available:

- `fast_forward.index.memory.InMemoryIndex`: Everything is held in memory entirely.
- `fast_forward.index.disk.OnDiskIndex`: Vectors are stored on disk and accessed on demand.
- `InMemoryIndex`: Everything is held in memory entirely.
- `OnDiskIndex`: Vectors are stored on disk and accessed on demand.

OnDiskIndexes can be loaded into memory using `fast_forward.index.disk.OnDiskIndex.to_memory`.
OnDiskIndexes can be loaded into memory using `OnDiskIndex.to_memory`.

# Creating an index
## Creating an index

The following snippet illustrates how to create a `fast_forward.index.disk.OnDiskIndex` object (given a `fast_forward.encoder.Encoder`, `my_query_encoder`) and add some vector representations to it:
The following snippet illustrates how to create an `OnDiskIndex` object (given a `fast_forward.encoder.Encoder`, `my_query_encoder`) and add some vector representations to it:

```python
my_index = OnDiskIndex(Path("my_index.h5"), my_query_encoder)
@@ -24,40 +26,40 @@ my_index.add(

Here, `my_vectors` is a Numpy array of shape `(3, 768)`, `768` being the dimensionality of the vector representations. The first two vectors correspond to two passages of the document `d1`, the third vector corresponds to `d2`, which has only a single passage. It is also possible to provide either only document IDs or only passage IDs.
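
As a concrete sketch, such a call might look as follows (the `doc_ids`/`psg_ids` keyword names are assumptions for illustration; random vectors stand in for encoded passages):

```python
import numpy as np

my_vectors = np.random.normal(size=(3, 768)).astype(np.float32)

# the first two vectors belong to passages of d1; the third is d2's single passage
my_index.add(
    my_vectors,
    doc_ids=["d1", "d1", "d2"],
    psg_ids=["d1_p1", "d1_p2", "d2_p1"],
)
```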

The index can then be subsequently loaded back using `fast_forward.index.disk.OnDiskIndex.load`.
The index can subsequently be loaded back using `OnDiskIndex.load`.

# Using an index
## Using an index

Index can be used to compute semantic re-ranking scores by calling them directly. It requires a `fast_forward.ranking.Ranking` (typically, this comes from a sparse retriever) with the corresponding queries:
Indexes can be used to compute semantic re-ranking scores by calling them directly. This requires a `fast_forward.Ranking` (typically obtained from a sparse retriever) with the corresponding queries:

```python
ranking = Ranking.from_file(Path("/path/to/sparse/run.tsv"), queries)
result = my_index(ranking)
```

Here, `queries` is a simple dictionary mapping query IDs to actual queries to be encoded. The resulting ranking, `result`, has the semantic scores for the query-document (or query-passage) pairs. Afterwards, retrieval and re-ranking scores may be combined (see `fast_forward.ranking`).
Here, `queries` is a simple dictionary mapping query IDs to actual queries to be encoded. The resulting ranking, `result`, has the semantic scores for the query-document (or query-passage) pairs. Afterwards, retrieval and re-ranking scores may be combined (see [Rankings](../fast_forward.html#rankings)).
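
For example, `queries` might look like this (hypothetical query IDs and texts):

```python
queries = {
    "q1": "how do fast-forward indexes work?",
    "q2": "what is dense retrieval?",
}
```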

If the input ranking has a large number of queries, one can use the `batch_size` parameter. The following example processes the ranking in batches of `16` queries each:

```python
result = my_index(ranking, batch_size=16)
```

## Ranking mode
### Ranking mode

Each index has a ranking mode (`fast_forward.index.Mode`). The active mode determines the way scores are computed. For example, consider the [example index from earlier](#creating-an-index). Setting the mode to `fast_forward.index.Mode.PASSAGE` instructs the index to compute scores on the passage level (and expect passage IDs in the input ranking):
Each index has a ranking `Mode`. The active mode determines the way scores are computed. For example, consider the [example index from earlier](#creating-an-index). Setting the mode to `Mode.PASSAGE` instructs the index to compute scores on the passage level (and expect passage IDs in the input ranking):

```python
my_index.mode = Mode.PASSAGE
```

Similarly, the index can return document IDs, where the score of a document is computed as

- the highest score of its passages (`fast_forward.index.Mode.MAXP`),
- the score of the its first passage (`fast_forward.index.Mode.FIRSTP`) or
- the average score of all its passages (`fast_forward.index.Mode.AVEP`).
- the highest score of its passages (`Mode.MAXP`),
- the score of its first passage (`Mode.FIRSTP`) or
- the average score of all its passages (`Mode.AVEP`).
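
For example, switching the index above to MaxP scoring mirrors the `Mode.PASSAGE` snippet:

```python
my_index.mode = Mode.MAXP
```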

## Early stopping
### Early stopping

Early stopping is a technique to limit the number of index look-ups. This can be beneficial for OnDiskIndexes, especially when the disk is slow. For early stopping, a relatively small cut-off depth (e.g., `10`) is required, and it is mostly helpful when a large number of candidates are to be re-ranked. More information can be found [in the paper](https://dl.acm.org/doi/abs/10.1145/3485447.3511955). Note that the implementation here differs slightly from the algorithm in the paper, as the early stopping criterion is only computed at depths that are specified via the `early_stopping_depths` parameter for performance reasons.
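
A sketch of such a call (`early_stopping` and `early_stopping_alpha` are assumed parameter names; `early_stopping_depths` is the parameter mentioned above):

```python
# re-rank with a cut-off depth of 10, interpolation weight 0.2, and the
# stopping criterion checked only at depths 400 and 5000 (assumed names)
result = my_index(
    ranking,
    early_stopping=10,
    early_stopping_alpha=0.2,
    early_stopping_depths=(400, 5000),
)
```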

6 changes: 4 additions & 2 deletions src/fast_forward/docs/indexer.md
@@ -1,4 +1,6 @@
The `fast_forward.indexer.Indexer` class is a utility for indexing collections or adding pre-computed vectors to an index.
# Indexer

The `Indexer` class is a utility for indexing collections or adding pre-computed vectors to an index.

If the size of the collection is known in advance, it can be specified when the index is created in order to avoid subsequent resizing operations:
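
A minimal sketch (assuming `init_size` as the name of the pre-allocation argument):

```python
# pre-allocate space for 2**20 vectors to avoid resizing later
my_index = OnDiskIndex(Path("my_index.h5"), my_query_encoder, init_size=2**20)
```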

@@ -21,7 +23,7 @@ The indexer can be created as follows:
indexer = Indexer(my_index, doc_encoder, encoder_batch_size=8)
```

`fast_forward.indexer.Indexer.from_dicts` consumes an iterator that yields dictionaries:
`Indexer.from_dicts` consumes an iterator that yields dictionaries:

```python
def docs_iter():
30 changes: 16 additions & 14 deletions src/fast_forward/docs/main.md
@@ -4,11 +4,20 @@ This is the implementation of [Fast-Forward indexes](https://dl.acm.org/doi/abs/

# Features

- Efficient look-up-based computation of semantic ranking scores
- Interpolation of lexical (retrieval) and semantic (re-ranking) scores
- **Efficient look-up-based computation** of semantic ranking scores
- **Interpolation** of lexical (retrieval) and semantic (re-ranking) scores
- Passage- and document-level ranking, including MaxP, FirstP, and AverageP
- Early stopping for limiting index look-ups
- Index compression via sequential coalescing
- **Early stopping** for limiting index look-ups
- Index compression via **quantization** and **sequential coalescing**

# How to...

- [create and use Fast-Forward indexes?](fast_forward/index.html)
- [index a collection?](fast_forward/util.html#indexer)
- [use quantization to reduce index size?](fast_forward/quantizer.html)
- [create custom encoders?](fast_forward/encoder.html#custom-encoders)
- [read, manipulate, and save rankings?](#rankings)
- [use Fast-Forward indexes with PyTerrier?](fast_forward/util/pyterrier.html)

# Installation

@@ -38,7 +47,9 @@ Using a Fast-Forward index is as simple as providing a TREC run with sparse scor

```python
from pathlib import Path
from fast_forward import OnDiskIndex, Mode, Ranking

from fast_forward import Ranking
from fast_forward.index import OnDiskIndex, Mode
from fast_forward.encoder import TCTColBERTQueryEncoder

# choose a pre-trained query encoder
@@ -67,12 +78,3 @@ out = ff_index(first_stage_ranking)
# interpolate scores and create a new TREC runfile
first_stage_ranking.interpolate(out, 0.1).save(Path("/path/to/output/run.tsv"))
```

## How to...

- [create and use Fast-Forward indexes?](fast_forward/index.html)
- [index a collection?](fast_forward/indexer.html)
- [use quantization to reduce index size?](fast_forward/quantizer.html)
- [create custom encoders?](fast_forward/encoder.html#custom-encoders)
- [read, manipulate, and save rankings?](fast_forward/ranking.html)
- [use Fast-Forward indexes with PyTerrier?](fast_forward/util.html#pyterrier-transformers)
11 changes: 11 additions & 0 deletions src/fast_forward/docs/pyterrier.md
@@ -0,0 +1,11 @@
# PyTerrier transformers

Fast-Forward indexes can seamlessly be integrated into [PyTerrier](https://pyterrier.readthedocs.io/en/latest/) pipelines using the transformers provided in `fast_forward.util.pyterrier`. Specifically, a re-ranking pipeline might look like this, given that `my_index` is a Fast-Forward index of the MS MARCO passage corpus:

```python
bm25 = pt.BatchRetrieve.from_dataset(
"msmarco_passage", variant="terrier_stemmed", wmodel="BM25"
)

ff_pl = bm25 % 5000 >> FFScore(my_index) >> FFInterpolate(0.2)
```
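
The resulting pipeline can then be used like any other PyTerrier transformer; a sketch with a hypothetical dataset handle:

```python
dataset = pt.get_dataset("irds:msmarco-passage/dev/small")
results = ff_pl.transform(dataset.get_topics())
```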
14 changes: 8 additions & 6 deletions src/fast_forward/docs/quantizer.md
@@ -1,7 +1,9 @@
Fast-Forward indexes support (product) quantization as a means of compressing an index. The `fast_forward.quantizer.Quantizer` class defines the interface for quantizers to implement. Currently, the following quantizers are available:
# Quantization

- `fast_forward.quantizer.nanopq.NanoPQ`: Product quantization based on the [nanopq library](https://nanopq.readthedocs.io/en/stable/index.html).
- `fast_forward.quantizer.nanopq.NanoOPQ`: Optimized product quantization based on the [nanopq library](https://nanopq.readthedocs.io/en/stable/index.html).
Fast-Forward indexes support (product) quantization as a means of compressing an index. The `Quantizer` class defines the interface for quantizers to implement. Currently, the following quantizers are available:

- `NanoPQ`: Product quantization based on the [nanopq library](https://nanopq.readthedocs.io/en/stable/index.html).
- `NanoOPQ`: Optimized product quantization based on the [nanopq library](https://nanopq.readthedocs.io/en/stable/index.html).

Quantizers must be trained **before** they are used with the corresponding Fast-Forward index. The typical workflow is as follows:

@@ -10,14 +12,14 @@ from pathlib import Path

import numpy as np

from fast_forward import OnDiskIndex
from fast_forward.quantizer.nanopq import NanoPQ
from fast_forward.index import OnDiskIndex
from fast_forward.quantizer import NanoPQ

# in practice, a subset of the encoded corpus should be used as training vectors
training_vectors = np.random.normal(size=(2**10, 768)).astype(np.float32)

quantizer = NanoPQ(M=8, Ks=256)
quantizer.fit(training_vectors)

index = OnDiskIndex(Path("index.h5"), quantizer=quantizer)
index = OnDiskIndex(Path("my_index.h5"), quantizer=quantizer)
```
12 changes: 7 additions & 5 deletions src/fast_forward/docs/ranking.md
@@ -1,10 +1,12 @@
Rankings (or runs) are represented using the `fast_forward.ranking.Ranking` class. Each ranking contains query IDs, document/passage IDs, and corresponding scores. Rankings may be instantiated from TREC runfiles using `fast_forward.ranking.Ranking.from_file`:
# Rankings

Rankings (or runs) are represented using the `Ranking` class. Each ranking contains query IDs, document/passage IDs, and corresponding scores. Rankings may be instantiated from TREC runfiles using `Ranking.from_file`:

```python
r = Ranking.from_file(Path("/path/to/TREC/run.tsv"))
```

Alternatively, rankings can be created from nested dictionaries using `fast_forward.ranking.Ranking.from_run`:
Alternatively, rankings can be created from nested dictionaries using `Ranking.from_run`:

```python
run = {
@@ -14,7 +16,7 @@ run = {
r = Ranking.from_run(run)
```

Optionally, rankings can have queries attached using `fast_forward.ranking.Ranking.attach_queries`:
Optionally, rankings can have queries attached using `Ranking.attach_queries`:

```python
r = r.attach_queries(
@@ -33,7 +35,7 @@ Rankings implement addition and multiplication operators, for example:
ranking_3 = 0.1 * ranking_1 + ranking_2
```

`fast_forward.ranking.Ranking.interpolate` allows to interpolate the scores of two rankings (i.e., retrieval and re-ranking):
`Ranking.interpolate` allows interpolating the scores of two rankings (i.e., retrieval and re-ranking):

```python
first_stage_ranking = Ranking.from_file(Path("/path/to/TREC/run.tsv"))
@@ -43,7 +45,7 @@ interpolated_ranking = first_stage_ranking.interpolate(semantic_scores, 0.1)
interpolated_ranking = first_stage_ranking * 0.1 + semantic_scores * 0.9
```

Additionally, `fast_forward.ranking.Ranking.rr_scores` recomputes a ranking's scores based on the reciprocal rank. This allows, for example, to perform [reciprocal rank fusion (RRF)](https://dl.acm.org/doi/10.1145/1571941.1572114) as follows:
Additionally, `Ranking.rr_scores` recomputes a ranking's scores based on the reciprocal rank. This allows, for example, performing [reciprocal rank fusion (RRF)](https://dl.acm.org/doi/10.1145/1571941.1572114) as follows:

```python
rrf_ranking = first_stage_ranking.rr_scores() + semantic_scores.rr_scores()
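
# the fused ranking can be saved back to a TREC runfile
# (sketch; `Ranking.save` is shown in the quickstart above)
rrf_ranking.save(Path("/path/to/rrf_run.tsv"))
```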
12 changes: 0 additions & 12 deletions src/fast_forward/docs/util.md
@@ -7,15 +7,3 @@ my_index = OnDiskIndex.load(Path("/path/to/index.h5"))
coalesced_index = InMemoryIndex(mode=Mode.MAXP)
create_coalesced_index(my_index, coalesced_index, 0.3)
```

# PyTerrier transformers

Fast-Forward indexes can seamlessly be integrated into [PyTerrier](https://pyterrier.readthedocs.io/en/latest/) pipelines using the transformers provided in `fast_forward.util.pyterrier`. Specifically, a re-ranking pipeline might look like this, given that `my_index` is a Fast-Forward index of the MS MARCO passage corpus:

```python
bm25 = pt.BatchRetrieve.from_dataset(
"msmarco_passage", variant="terrier_stemmed", wmodel="BM25"
)

ff_pl = bm25 % 5000 >> FFScore(my_index) >> FFInterpolate(0.2)
```
38 changes: 38 additions & 0 deletions src/fast_forward/encoder/__init__.py
@@ -0,0 +1,38 @@
""".. include:: ../docs/encoder.md""" # noqa: D400, D415

from typing import TYPE_CHECKING

import numpy as np

from fast_forward.encoder.base import Encoder
from fast_forward.encoder.transformer import (
TCTColBERTDocumentEncoder,
TCTColBERTQueryEncoder,
TransformerEncoder,
)

if TYPE_CHECKING:
from collections.abc import Callable, Sequence

__all__ = [
"Encoder",
"LambdaEncoder",
"TransformerEncoder",
"TCTColBERTQueryEncoder",
"TCTColBERTDocumentEncoder",
]


class LambdaEncoder(Encoder):
"""Encoder adapter class for arbitrary encoding functions."""

def __init__(self, f: "Callable[[str], np.ndarray]") -> None:
"""Create a lambda encoder.
:param f: Function to encode a single piece of text.
"""
super().__init__()
self._f = f

def _encode(self, texts: "Sequence[str]") -> np.ndarray:
return np.array(list(map(self._f, texts)))
23 changes: 23 additions & 0 deletions src/fast_forward/encoder/base.py
@@ -0,0 +1,23 @@
import abc
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from collections.abc import Sequence

import numpy as np


class Encoder(abc.ABC):
"""Base class for encoders."""

@abc.abstractmethod
def _encode(self, texts: "Sequence[str]") -> "np.ndarray":
pass

def __call__(self, texts: "Sequence[str]") -> "np.ndarray":
"""Encode a list of texts.
:param texts: The texts to encode.
:return: The resulting vector representations.
"""
return self._encode(texts)