From d76bb4caa7059f0a613836d812e494f383bf128a Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Sat, 4 Nov 2023 15:36:11 -0400 Subject: [PATCH] Add regressions for OpenAI-ada2 embeddings on MS MARCO passage (#2235) With Lucene 9.8 upgrade, the 1536d embeddings work now! --- README.md | 4 +- ...experiments-msmarco-passage-openai-ada2.md | 2 +- .../regressions-dl19-passage-openai-ada2.md | 119 ++++++++++++++++++ .../regressions-dl20-passage-openai-ada2.md | 119 ++++++++++++++++++ ...regressions-msmarco-passage-openai-ada2.md | 112 +++++++++++++++++ .../anserini/index/IndexHnswDenseVectors.java | 39 +++++- .../LuceneDenseVectorDocumentGenerator.java | 11 +- .../dl19-passage-cos-dpr-distil.template | 6 +- .../dl19-passage-openai-ada2.template | 97 ++++++++++++++ .../dl20-passage-cos-dpr-distil.template | 6 +- .../dl20-passage-openai-ada2.template | 97 ++++++++++++++ .../msmarco-passage-openai-ada2.template | 90 +++++++++++++ .../regression/dl19-passage-openai-ada2.yaml | 63 ++++++++++ .../regression/dl20-passage-openai-ada2.yaml | 63 ++++++++++ .../msmarco-passage-openai-ada2.yaml | 63 ++++++++++ 15 files changed, 877 insertions(+), 14 deletions(-) create mode 100644 docs/regressions/regressions-dl19-passage-openai-ada2.md create mode 100644 docs/regressions/regressions-dl20-passage-openai-ada2.md create mode 100644 docs/regressions/regressions-msmarco-passage-openai-ada2.md create mode 100644 src/main/resources/docgen/templates/dl19-passage-openai-ada2.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-openai-ada2.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template create mode 100644 src/main/resources/regression/dl19-passage-openai-ada2.yaml create mode 100644 src/main/resources/regression/dl20-passage-openai-ada2.yaml create mode 100644 src/main/resources/regression/msmarco-passage-openai-ada2.yaml diff --git a/README.md b/README.md index 4c7919d98..d9e970ab6 100644 --- a/README.md +++ 
b/README.md @@ -86,7 +86,8 @@ See individual pages for details! | SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd.md) | | SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd-onnx.md) | | **Learned Dense** | | | | -| cosDPR-distil | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil.md) | +| cosDPR-distil | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil.md) | +| OpenAI-ada2 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | ### Available Corpora for Download @@ -102,6 +103,7 @@ See individual pages for details! | [SPLADE++ CoCondenser-EnsembleDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar) | 4.2 GB | `e489133bdc54ee1e7c62a32aa582bc77` | | [SPLADE++ CoCondenser-SelfDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar) | 4.8 GB | `cb7e264222f2bf2221dd2c9d28190be1` | | [cosDPR-distil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar) | 57 GB | `e20ffbc8b5e7f760af31298aefeaebbd` | +| [OpenAI-ada2](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar) | 109 GB | `a4d843d522ff3a3af7edbee789a63402` |
diff --git a/docs/experiments-msmarco-passage-openai-ada2.md b/docs/experiments-msmarco-passage-openai-ada2.md index 01b1b42fa..dacf86168 100644 --- a/docs/experiments-msmarco-passage-openai-ada2.md +++ b/docs/experiments-msmarco-passage-openai-ada2.md @@ -1,4 +1,4 @@ -# Anserini: OpenAI-ada2 Embeddings for MS MARCO Passage Ranking +# Anserini: OpenAI-ada2 Embeddings for MS MARCO Passage This guide explains how to reproduce experiments with OpenAI-ada2 emebddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). In these experiments, we are using pre-encoded queries (i.e., cached results of query embeddings). diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md new file mode 100644 index 000000000..ed4950e27 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md @@ -0,0 +1,119 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-openai-ada2.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-openai-ada2 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -generator LuceneDenseVectorDocumentGenerator \ + -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt \ + -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.704 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.624 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.857 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md new file mode 100644 index 000000000..cefd54f66 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md @@ -0,0 +1,119 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-openai-ada2.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-openai-ada2 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -generator LuceneDenseVectorDocumentGenerator \ + -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl20-passage.openai-ada2.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt \ + -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.723 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.867 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md new file mode 100644 index 000000000..ab28ff703 --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md @@ -0,0 +1,112 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-openai-ada2.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-openai-ada2 +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 \ + --corpus-path collections/msmarco-passage-openai-ada2 +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +target/appassembler/bin/IndexHnswDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2 \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -generator LuceneDenseVectorDocumentGenerator \ + -threads 16 -M 16 -efC 100 -memorybuffer 65536 \ + >& logs/log.msmarco-passage-openai-ada2 & +``` + +The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +target/appassembler/bin/SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| 
**R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.898 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.985 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index e92835448..38d7525dc 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -30,12 +30,16 @@ import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.core.config.Configurator; import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.KnnVectorsReader; +import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -76,8 +80,7 @@ public static final class Args { public static final String VECTOR = "vector"; private static final int TIMEOUT = 600 * 1000; - - + // required arguments 
@Option(name = "-M", metaVar = "[num]", required = true, usage = "HNSW parameters M") @@ -374,6 +377,35 @@ public IndexHnswDenseVectors(Args args) throws Exception { this.counters = new Counters(); } + // Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html + // This class exists because Lucene95HnswVectorsFormat's getMaxDimensions method is final and we + // need to workaround that constraint to allow more than the default number of dimensions + private static final class OpenAiDelegatingKnnVectorsFormat extends KnnVectorsFormat { + private final KnnVectorsFormat delegate; + private final int maxDimensions; + + public OpenAiDelegatingKnnVectorsFormat(KnnVectorsFormat delegate, int maxDimensions) { + super(delegate.getName()); + this.delegate = delegate; + this.maxDimensions = maxDimensions; + } + + @Override + public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException { + return delegate.fieldsWriter(state); + } + + @Override + public KnnVectorsReader fieldsReader(SegmentReadState state) throws IOException { + return delegate.fieldsReader(state); + } + + @Override + public int getMaxDimensions(String fieldName) { + return maxDimensions; + } + } + public Counters run() throws IOException { final long start = System.nanoTime(); LOG.info("============ Indexing Collection ============"); @@ -387,7 +419,8 @@ public Counters run() throws IOException { final IndexWriterConfig config = new IndexWriterConfig().setCodec(new Lucene95Codec(){ @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { - return new Lucene95HnswVectorsFormat(args.M, args.efC); + return new OpenAiDelegatingKnnVectorsFormat( + new Lucene95HnswVectorsFormat(args.M, args.efC), 4096); } }); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); diff --git a/src/main/java/io/anserini/index/generator/LuceneDenseVectorDocumentGenerator.java 
b/src/main/java/io/anserini/index/generator/LuceneDenseVectorDocumentGenerator.java index caad4b1d8..38a7177fb 100644 --- a/src/main/java/io/anserini/index/generator/LuceneDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/LuceneDenseVectorDocumentGenerator.java @@ -21,13 +21,16 @@ import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import io.anserini.collection.SourceDocument; +import io.anserini.index.Constants; import io.anserini.index.IndexHnswDenseVectors; +import org.apache.lucene.document.BinaryDocValuesField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.KnnVectorField; +import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.util.BytesRef; import java.util.ArrayList; @@ -78,9 +81,11 @@ public Document createDocument(T src) throws InvalidDocumentException { final Document document = new Document(); // Store the collection docid. - document.add(new StringField(IndexHnswDenseVectors.Args.ID, id, Field.Store.YES)); + document.add(new StringField(Constants.ID, id, Field.Store.YES)); // This is needed to break score ties by docid. 
- document.add(new KnnVectorField(IndexHnswDenseVectors.Args.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT)); + document.add(new BinaryDocValuesField(Constants.ID, new BytesRef(id))); + + document.add(new KnnFloatVectorField(IndexHnswDenseVectors.Args.VECTOR, contents, VectorSimilarityFunction.DOT_PRODUCT)); if (args.storeRaw) { document.add(new StoredField(IndexHnswDenseVectors.Args.RAW, src.raw())); } diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template index e389548c1..3d24eeb70 100644 --- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ -tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. 
With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template new file mode 100644 index 000000000..ad66bcf64 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template index 1d6c2e552..50d46f535 100644 --- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template @@ -35,11 +35,11 @@ The `run_regression.py` script automates the following steps, but if you want to Download the corpus and unpack into `collections/`: ```bash -wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ -tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ ``` -To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. With the corpus downloaded, the following command will perform the remaining steps below: ```bash diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template new file mode 100644 index 000000000..1549ee91a --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. 
+ +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). 
+ +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template new file mode 100644 index 000000000..66deba42f --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template @@ -0,0 +1,90 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + + + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally stable to the third digit after the decimal point. + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml new file mode 100644 index 000000000..cf63f0276 --- /dev/null +++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml @@ -0,0 +1,63 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +collection_class: JsonDenseVectorCollection +generator_class: LuceneDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memorybuffer 65536 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + 
metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.479 + nDCG@10: + - 0.704 + R@100: + - 0.624 + R@1000: + - 0.857 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml new file mode 100644 index 000000000..97c696f79 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml @@ -0,0 +1,63 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +collection_class: JsonDenseVectorCollection +generator_class: LuceneDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memorybuffer 65536 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: 
tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20-passage.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.477 + nDCG@10: + - 0.676 + R@100: + - 0.723 + R@1000: + - 0.867 diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml new file mode 100644 index 000000000..77549dc6b --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml @@ -0,0 +1,63 @@ +--- +corpus: msmarco-passage-openai-ada2 +corpus_path: collections/msmarco/msmarco-passage-openai-ada2/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar +download_checksum: a4d843d522ff3a3af7edbee789a63402 + +index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +collection_class: JsonDenseVectorCollection +generator_class: LuceneDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -memorybuffer 65536 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + 
separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2 + display: OpenAI-ada2 + params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 + results: + AP@1000: + - 0.350 + RR@10: + - 0.343 + R@100: + - 0.898 + R@1000: + - 0.985