diff --git a/README.md b/README.md index 0b1601d2d7..d89308f7d3 100644 --- a/README.md +++ b/README.md @@ -109,26 +109,44 @@ See individual pages for details. ### MS MARCO V1 Passage Regressions +| | dev | DL19 | DL20 | +|--------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| +| **Unsupervised Sparse** | | | | +| Lucene BoW baselines | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.md) | +| Quantized BM25 | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.bm25-b8.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.bm25-b8.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.bm25-b8.md) | +| WordPiece baselines (pre-tokenized) | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-tok.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-tok.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-tok.md) | +| WordPiece baselines (Huggingface) | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-hgf.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-hgf.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-hgf.md) | +| WordPiece + Lucene BoW baselines | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-ca.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-ca.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-ca.md) | +| doc2query | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.doc2query.md) | | | +| doc2query-T5 | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.docTTTTTquery.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.docTTTTTquery.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.docTTTTTquery.md) | +| **Learned Sparse (uniCOIL family)** | | | | +| uniCOIL noexp | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil-noexp.cached.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.unicoil-noexp.cached.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.unicoil-noexp.cached.md) | +| uniCOIL with doc2query-T5 | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil.cached.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.unicoil.cached.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.unicoil.cached.md) | +| uniCOIL with TILDE | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil-tilde-expansion.cached.md) | | | +| **Learned Sparse 
(other)** | | | | +| DeepImpact | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.deepimpact.cached.md) | | | +| SPLADEv2 | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.distill-splade-max.cached.md) | | | +| SPLADE++ CoCondenser-EnsembleDistil | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.splade-pp-ed.onnx.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.splade-pp-ed.onnx.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.splade-pp-ed.onnx.md) | +| SPLADE++ CoCondenser-SelfDistil | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.splade-pp-sd.onnx.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.splade-pp-sd.onnx.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.splade-pp-sd.onnx.md) | +| **Learned Dense** (HNSW indexes) | | | | +| cosDPR-distil | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md) | +| BGE-base-en-v1.5 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md) 
int8:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md) | +| OpenAI Ada2 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md) | +| Cohere English v3.0 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md) | +| **Learned Dense** (flat indexes) | | | | +| cosDPR-distil | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.md) | +| BGE-base-en-v1.5 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md) | 
full:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md) | +| OpenAI Ada2 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat.cached.md) int8:[πŸ«™οΈ](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat-int8.cached.md) | +| Cohere English v3.0 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md) | +| **Learned Dense** (Inverted; experimental) | | | | +| cosDPR-distil w/ "fake words" | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.fw.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.fw.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.fw.md) | +| cosDPR-distil w/ "LexLSH" | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.lexlsh.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.lexlsh.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.lexlsh.md) | + +
+Deprecated instructions for learned dense models using corpora in jsonl format | | dev | DL19 | DL20 | |--------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:| -| **Unsupervised Sparse** | | | | -| Lucene BoW baselines | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.md) | -| Quantized BM25 | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.bm25-b8.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.bm25-b8.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.bm25-b8.md) | -| WordPiece baselines (pre-tokenized) | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-tok.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-tok.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-tok.md) | -| WordPiece baselines (Huggingface) | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-hgf.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-hgf.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-hgf.md) | -| WordPiece + Lucene BoW baselines | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.wp-ca.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.wp-ca.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.wp-ca.md) | -| doc2query | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.doc2query.md) | | | -| doc2query-T5 | [πŸ”‘](docs/regressions/regressions-msmarco-v1-passage.docTTTTTquery.md) | [πŸ”‘](docs/regressions/regressions-dl19-passage.docTTTTTquery.md) | [πŸ”‘](docs/regressions/regressions-dl20-passage.docTTTTTquery.md) | -| **Learned Sparse (uniCOIL family)** | | | | -| uniCOIL noexp | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil-noexp.cached.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.unicoil-noexp.cached.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.unicoil-noexp.cached.md) | -| uniCOIL with doc2query-T5 | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil.cached.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.unicoil.cached.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.unicoil.cached.md) | -| uniCOIL with TILDE | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.unicoil-tilde-expansion.cached.md) | | | -| **Learned Sparse (other)** | | | | -| DeepImpact | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.deepimpact.cached.md) | | | -| SPLADEv2 | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.distill-splade-max.cached.md) | 
| | -| SPLADE++ CoCondenser-EnsembleDistil | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.splade-pp-ed.onnx.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.splade-pp-ed.onnx.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.splade-pp-ed.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.splade-pp-ed.onnx.md) | -| SPLADE++ CoCondenser-SelfDistil | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.splade-pp-sd.onnx.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.splade-pp-sd.onnx.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.splade-pp-sd.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.splade-pp-sd.onnx.md) | | **Learned Dense** (HNSW indexes) | | | | | cosDPR-distil | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md) | | BGE-base-en-v1.5 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md) | @@ -139,9 +157,8 @@ See individual pages for details. 
| BGE-base-en-v1.5 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.onnx.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.onnx.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.cached.md)[πŸ…ΎοΈ](docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.onnx.md) | | OpenAI Ada2 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.flat.cached.md) int8:[πŸ«™οΈ](docs/regressions/regressions-msmarco-v1-passage.openai-ada2.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.openai-ada2.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.openai-ada2.flat-int8.cached.md) | | Cohere English v3.0 | full:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.flat-int8.cached.md) | full:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.flat.cached.md) int8:[πŸ«™](docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.flat-int8.cached.md) | -| **Learned Dense** (Inverted; experimental) | | | | -| cosDPR-distil w/ "fake words" | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.fw.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.fw.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.fw.md) | -| cosDPR-distil w/ "LexLSH" | [πŸ«™](docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.lexlsh.md) | [πŸ«™](docs/regressions/regressions-dl19-passage.cos-dpr-distil.lexlsh.md) | [πŸ«™](docs/regressions/regressions-dl20-passage.cos-dpr-distil.lexlsh.md) | + +
Key: + πŸ”‘ = keyword queries @@ -151,20 +168,34 @@ Key: ### Available Corpora for Download -| Corpora | Size | Checksum | -|:---------------------------------------------------------------------------------------------------------------------|-------:|:-----------------------------------| -| [Quantized BM25](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-bm25-b8.tar) | 1.2 GB | `0a623e2c97ac6b7e814bf1323a97b435` | -| [uniCOIL (noexp)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar) | 2.7 GB | `f17ddd8c7c00ff121c3c3b147d2e17d8` | -| [uniCOIL (d2q-T5)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar) | 3.4 GB | `78eef752c78c8691f7d61600ceed306f` | -| [uniCOIL (TILDE)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-tilde-expansion.tar) | 3.9 GB | `12a9c289d94e32fd63a7d39c9677d75c` | -| [DeepImpact](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-deepimpact.tar) | 3.6 GB | `73843885b503af3c8b3ee62e5f5a9900` | -| [SPLADEv2](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-distill-splade-max.tar) | 9.9 GB | `b5d126f5d9a8e1b3ef3f5cb0ba651725` | -| [SPLADE++ CoCondenser-EnsembleDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar) | 4.2 GB | `e489133bdc54ee1e7c62a32aa582bc77` | -| [SPLADE++ CoCondenser-SelfDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar) | 4.8 GB | `cb7e264222f2bf2221dd2c9d28190be1` | -| [cosDPR-distil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar) | 57 GB | `e20ffbc8b5e7f760af31298aefeaebbd` | -| [BGE-base-en-v1.5](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.tar) | 59 GB | `353d2c9e72e858897ad479cca4ea0db1` | -| [OpenAI-ada2](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar) | 109 GB | `a4d843d522ff3a3af7edbee789a63402` | -| [Cohere embed-english-v3.0](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.tar) | 38 GB | `06a6e38a0522850c6aa504db7b2617f5` | +| Corpora | Size | Checksum | +|:-----------------------------------------------------------------------------------------------------------------------------------------|-------:|:-----------------------------------| +| [Quantized BM25](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-bm25-b8.tar) | 1.2 GB | `0a623e2c97ac6b7e814bf1323a97b435` | +| [uniCOIL (noexp)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-noexp.tar) | 2.7 GB | `f17ddd8c7c00ff121c3c3b147d2e17d8` | +| [uniCOIL (d2q-T5)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil.tar) | 3.4 GB | `78eef752c78c8691f7d61600ceed306f` | +| [uniCOIL (TILDE)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-unicoil-tilde-expansion.tar) | 3.9 GB | `12a9c289d94e32fd63a7d39c9677d75c` | +| [DeepImpact](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-deepimpact.tar) | 3.6 GB | `73843885b503af3c8b3ee62e5f5a9900` | +| [SPLADEv2](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-passage-distill-splade-max.tar) | 9.9 GB | `b5d126f5d9a8e1b3ef3f5cb0ba651725` | +| [SPLADE++ CoCondenser-EnsembleDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-ed.tar) | 4.2 GB | `e489133bdc54ee1e7c62a32aa582bc77` | +| [SPLADE++ CoCondenser-SelfDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-splade-pp-sd.tar) | 4.8 GB | 
`cb7e264222f2bf2221dd2c9d28190be1` | +| [cosDPR-distil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar) (parquet) | 38 GB | `c8a204fbc3ccda581aa375936af43a97` | +| [BGE-base-en-v1.5](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar) (parquet) | 39 GB | `b235e19ec492c18a18057b30b8b23fd4` | +| [OpenAI-ada2](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar) (parquet) | 75 GB | `fa3637e9c4150b157270e19ef3a4f779` | +| [Cohere embed-english-v3.0](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar) (parquet) | 16 GB | `40c5caf33476746e93ceeb75174b8d64` | + +
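The sizes and MD5 checksums listed above can be used to verify a tarball before unpacking it. A minimal sketch with standard tools (`wget` and GNU `md5sum`), using the cosDPR-distil parquet corpus from the table as the example; any of the other corpora can be checked the same way:

```bash
# Download one of the corpora listed above into collections/.
wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/

# Verify the MD5 checksum against the value in the table above.
echo "c8a204fbc3ccda581aa375936af43a97  collections/msmarco-passage-cos-dpr-distil.parquet.tar" | md5sum -c -

# Unpack only once the checksum matches.
tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/
```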
+Deprecated corpora for learned dense models using corpora in jsonl format + +| Corpora | Size | Checksum | +|:-----------------------------------------------------------------------------------------------------------------------------------------|-------:|:-----------------------------------| +| [cosDPR-distil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar) (jsonl, deprecated) | 57 GB | `e20ffbc8b5e7f760af31298aefeaebbd` | +| [BGE-base-en-v1.5](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.tar) (jsonl, deprecated) | 59 GB | `353d2c9e72e858897ad479cca4ea0db1` | +| [OpenAI-ada2](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar) (jsonl, deprecated) | 109 GB | `a4d843d522ff3a3af7edbee789a63402` | +| [Cohere embed-english-v3.0](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.tar) (jsonl, deprecated) | 38 GB | `06a6e38a0522850c6aa504db7b2617f5` | + +
+ +
@@ -196,6 +227,8 @@ Key: | [MS MARCO V1 doc: uniCOIL (noexp)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil-noexp.tar) | 11 GB | `11b226e1cacd9c8ae0a660fd14cdd710` | | [MS MARCO V1 doc: uniCOIL (d2q-T5)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco-doc-segmented-unicoil.tar) | 19 GB | `6a00e2c0c375cb1e52c83ae5ac377ebb` | +
+
MS MARCO V2 Passage Regressions @@ -227,6 +260,8 @@ Key: | [SPLADE++ CoCondenser-EnsembleDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_ed.tar) | 66 GB | `2cdb2adc259b8fa6caf666b20ebdc0e8` | | [SPLADE++ CoCondenser-SelfDistil](https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco_v2_passage_splade_pp_sd.tar) | 76 GB | `061930dd615c7c807323ea7fc7957877` | +
+
MS MARCO V2 Document Regressions @@ -252,6 +287,8 @@ Key: | [MS MARCO V2 doc: uniCOIL (noexp)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_noexp_0shot_v2.tar) | 55 GB | `97ba262c497164de1054f357caea0c63` | | [MS MARCO V2 doc: uniCOIL (d2q-T5)](https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/data/msmarco_v2_doc_segmented_unicoil_0shot_v2.tar) | 72 GB | `c5639748c2cbad0152e10b0ebde3b804` | +
+
MS MARCO V2.1 Document Regressions @@ -268,6 +305,8 @@ The experiments below capture topics and qrels originally targeted at the V2 cor | **Unsupervised Lexical, Segmented Doc** | | | | | | | baselines | [+](docs/regressions/regressions-msmarco-v2.1-doc-segmented.md) | [+](docs/regressions/regressions-dl21-doc-segmented-msmarco-v2.1.md) | [+](docs/regressions/regressions-dl22-doc-segmented-msmarco-v2.1.md) | [+](docs/regressions/regressions-dl23-doc-segmented-msmarco-v2.1.md) | [+](docs/regressions/regressions-rag24-doc-segmented-raggy-dev.md) | +
+
BEIR (v1.0.0) Regressions @@ -406,6 +445,8 @@ Substitute the appropriate `$MODEL` from the table below. | BGE (HNSW, full; ONNX) | `bge-base-en-v1.5.parquet.hnsw.onnx` | | BGE (HNSW, int8; ONNX) | `bge-base-en-v1.5.parquet.hnsw-int8.onnx` | +
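To make the `$MODEL` substitution above concrete, here is a hypothetical sketch; it assumes the BEIR regression IDs follow a `beir-v1.0.0-<corpus>.$MODEL` pattern and uses `trec-covid` as the example corpus (both the pattern and the corpus name are assumptions for illustration, not taken from this diff):

```bash
# Illustrative only: run a single BEIR regression with $MODEL taken from the table above.
# Assumes regression IDs of the form beir-v1.0.0-<corpus>.$MODEL.
MODEL=bge-base-en-v1.5.parquet.hnsw.onnx

python src/main/python/run_regression.py --index --verify --search \
  --regression beir-v1.0.0-trec-covid.${MODEL}
```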
+
Cross-lingual and Multi-lingual Regressions @@ -428,6 +469,8 @@ Substitute the appropriate `$MODEL` from the table below. + Regressions for CIRAL (v1.0) BM25 (query translation): [Hausa](docs/regressions/regressions-ciral-v1.0-ha.md), [Somali](docs/regressions/regressions-ciral-v1.0-so.md), [Swahili](docs/regressions/regressions-ciral-v1.0-sw.md), [Yoruba](docs/regressions/regressions-ciral-v1.0-yo.md) + Regressions for CIRAL (v1.0) BM25 (document translation): [Hausa](docs/regressions/regressions-ciral-v1.0-ha-en.md), [Somali](docs/regressions/regressions-ciral-v1.0-so-en.md), [Swahili](docs/regressions/regressions-ciral-v1.0-sw-en.md), [Yoruba](docs/regressions/regressions-ciral-v1.0-yo-en.md) +
+
Other Regressions @@ -444,6 +487,8 @@ Substitute the appropriate `$MODEL` from the table below. + Regressions for [FEVER Fact Verification](docs/regressions/regressions-fever.md) + Regressions for DPR Wikipedia QA baselines: [100-word splits](docs/regressions/regressions-wikipedia-dpr-100w-bm25.md), [6/3 sliding window sentences](docs/regressions/regressions-wiki-all-6-3-tamber-bm25.md) +
+
## πŸ“ƒ Additional Documentation diff --git a/bin/run.sh b/bin/run.sh index 43bb461cc4..5edfa6c7c4 100755 --- a/bin/run.sh +++ b/bin/run.sh @@ -1,3 +1,3 @@ #!/bin/sh -java -cp `ls target/*-fatjar.jar` -Xms512M -Xmx64G --add-modules jdk.incubator.vector $@ \ No newline at end of file +java -cp `ls target/*-fatjar.jar` -Xms512M -Xmx192G --add-modules jdk.incubator.vector $@ diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.cached.md index cdb2df7473..c7dd567229 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.flat-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.onnx.md index 991f94b8f2..ebda2e35ab 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat-int8.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.flat-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.cached.md index 5e66d47108..08fd9ce64e 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.flat.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.onnx.md index 46f12c2674..0d15144391 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.flat.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.flat.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md index f8932ee610..69b83383fb 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.hnsw-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index dfe6b2ec9b..bf8d671cf7 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md index 895d4d7c52..d7313e5c85 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.hnsw.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
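The retrieval sections above point at topics and qrels under `tools/topics-and-qrels/`, which the pages note is linked into the repo as a submodule. If the repository was cloned without its submodules, a plain git command (not specific to these regressions) fetches them first:

```bash
# Fetch the anserini-tools submodule that provides tools/topics-and-qrels/.
git submodule update --init --recursive
```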
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md index 103a8f0570..69e7512f78 100644 --- a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.hnsw.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..559e9c7337 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4435 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7065 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6171 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8472 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
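Because the quantized (int8) runs are only expected to approximate the reference scores in the table, a small wrapper that pulls out the measured value can make the comparison quicker. This is a sketch reusing the qrels and run paths from the commands above; the 0.7065 target is the non-quantized nDCG@10 reference:

```bash
# Illustrative sanity check: print measured nDCG@10 next to the non-quantized reference.
QRELS=tools/topics-and-qrels/qrels.dl19-passage.txt
RUN=runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt

measured=$(bin/trec_eval -m ndcg_cut.10 -c "$QRELS" "$RUN" | awk '{print $3}')
echo "nDCG@10 measured: ${measured} (non-quantized reference: 0.7065)"
```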
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md new file mode 100644 index 0000000000..b82e282cc2 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl19-passage.txt \ + -encoder BgeBaseEn15 -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4435 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7065 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6171 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8472 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. 
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.cached.md new file mode 100644 index 0000000000..f18fb95446 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
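+
+As an optional sanity check, the number of distinct topic ids in the qrels should come out to 43; this assumes the standard whitespace-delimited TREC qrels format (topic, iteration, docid, grade):
+
+```bash
+# Count the judged DL19 passage topics; expected output: 43.
+awk '{print $1}' tools/topics-and-qrels/qrels.dl19-passage.txt | sort -u | wc -l
+```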
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4435 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7065 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6171 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8472 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.md new file mode 100644 index 0000000000..1bd14ef0f2 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.md @@ -0,0 +1,116 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
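+
+If you want to verify the download before unpacking, a quick check is shown below; it assumes GNU coreutils `md5sum` is available (on macOS, `md5 -q` prints the same hash):
+
+```bash
+# Should report "OK" if the 39 GB tarball downloaded intact.
+echo "b235e19ec492c18a18057b30b8b23fd4  collections/msmarco-passage-bge-base-en-v1.5.parquet.tar" | md5sum -c -
+```
+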
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl19-passage.txt \ + -encoder BgeBaseEn15 -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4435 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7065 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6171 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8472 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). 
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..befd00b70c --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
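+
+As a quick cross-check, the cached query encodings used in the retrieval step below are shipped as a gzipped JSONL file; assuming one topic per line (a guess about the format, not something this page specifies), the count should match the 43 judged topics:
+
+```bash
+# Expected output: 43, if the file mirrors the judged topic set
+# (a larger count would indicate it covers the full TREC 2019 test pool).
+gunzip -c tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz | wc -l
+```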
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..e93e379145 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
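+
+Optionally, you can sanity-check the unpacked corpus before indexing; the exact shard layout inside the directory is an assumption here, not something this page guarantees:
+
+```bash
+# List a few of the Parquet shards and report the total on-disk size of the unpacked corpus.
+ls collections/msmarco-passage-bge-base-en-v1.5.parquet/ | head
+du -sh collections/msmarco-passage-bge-base-en-v1.5.parquet/
+```
+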
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. 
+Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md new file mode 100644 index 0000000000..173bb3f236 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
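+
+The evaluation commands below use `-l 2`, which treats relevance grades 0 and 1 as not relevant. If you are curious how the grades are distributed, here is a small optional check, assuming the standard four-column TREC qrels format:
+
+```bash
+# Counts of each relevance grade (fourth column) in the DL19 passage qrels.
+awk '{print $4}' tools/topics-and-qrels/qrels.dl19-passage.txt | sort -n | uniq -c
+```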
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl19-passage.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md new file mode 100644 index 0000000000..547e7fd65b --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
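+
+Note that this is the same tarball (same URL and checksum) used by the other BGE-base-en-v1.5 Parquet regressions above, so there is no need to download it more than once. To peek at the archive layout without unpacking:
+
+```bash
+# List the first few entries in the tarball; a quick integrity and layout check.
+tar tf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar | head
+```
+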
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.444 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.706 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.847 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. 
+Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index 31100672d6..498b550217 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -60,7 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md index 1d3329f890..aa3447b766 100644 --- a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -60,7 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..10efe5827d --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md @@ -0,0 +1,112 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cohere-embed-english-v3.0**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.4884 |
+| **nDCG@10** | **cohere-embed-english-v3.0**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.6956 |
+| **R@100** | **cohere-embed-english-v3.0**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.6484 |
+| **R@1000** | **cohere-embed-english-v3.0**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.8630 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.md new file mode 100644 index 0000000000..1848bdd76e --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.md @@ -0,0 +1,110 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4884 | +| **nDCG@10** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6956 | +| **R@100** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6484 | +| **R@1000** 
| **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8630 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..e22067cb9c --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md @@ -0,0 +1,113 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
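+
+Since the end-to-end run can take a while (we are indexing 8,841,823 dense vectors), one option is to run it unattended and keep a log; a minimal sketch using only the flags shown above (the log file name is just a suggestion):
+
+```bash
+nohup python src/main/python/run_regression.py --download --index --verify --search \
+  --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \
+  >& logs/regression.dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.log &
+```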
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
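+
+If `tools/topics-and-qrels/` is empty (e.g., on a fresh clone), the submodule has not been fetched yet; the standard fix is:
+
+```bash
+git submodule update --init --recursive
+```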
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.696 | +| **R@100** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.648 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md new file mode 100644 index 0000000000..dc7e7d12a6 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -0,0 +1,113 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
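+
+One quick way to check the download before unpacking (assuming `md5sum` is available; on macOS, `md5 -q` is the equivalent):
+
+```bash
+md5sum collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar
+# should print 40c5caf33476746e93ceeb75174b8d64
+```
+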
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl19-passage.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.696 | +| **R@100** | **cohere-embed-english-v3.0**| +| [DL19 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.648 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md index 8c52a35688..2d24c09ff0 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md index 2d3c1a32d8..bd29d14b0d 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. 
-See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md index ab1fbc8299..52f157c7f7 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.cached.md @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md index 9db00f428e..336bf65e28 100644 --- a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.hnsw.onnx.md @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..77c11051cb --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
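+
+The commands on this page write to `indexes/`, `logs/`, and `runs/`; if you are stepping through them manually from a fresh checkout, it is worth making sure those directories exist first (a trivial precaution, skip it if they are already there):
+
+```bash
+mkdir -p collections indexes logs runs
+```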
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4656 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7250 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6173 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8201 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.md new file mode 100644 index 0000000000..2bf755e884 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.md @@ -0,0 +1,119 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
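+
+After unpacking, a quick sanity check is to confirm that the corpus directory is actually populated (the exact shard names do not matter; we only expect a directory of Parquet files):
+
+```bash
+du -sh collections/msmarco-passage-cos-dpr-distil.parquet
+ls collections/msmarco-passage-cos-dpr-distil.parquet | head
+```
+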
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl19-passage.txt \ + -encoder CosDprDistil -hits 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4656 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7250 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6173 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8201 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. 
+With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.cached.md new file mode 100644 index 0000000000..6d96a5925d --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
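+
+As a quick sanity check, the qrels should cover exactly those 43 topics; counting distinct query IDs in the (standard TREC-format) qrels file is one way to confirm:
+
+```bash
+awk '{print $1}' tools/topics-and-qrels/qrels.dl19-passage.txt | sort -u | wc -l
+# should print 43
+```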
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4656 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7250 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6173 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8201 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.onnx.md new file mode 100644 index 0000000000..cff934439c --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.flat.onnx.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
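+
+At 38 GB, an interrupted transfer would not be surprising; `wget -c` resumes a partial download in place instead of starting over:
+
+```bash
+wget -c https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/
+```
+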
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt \ + -encoder CosDprDistil -hits 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4656 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7250 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6173 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8201 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. 
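+
+To re-check the figures above without retyping the long run and qrels paths, the four `trec_eval` invocations can be collapsed with a couple of shell variables; this is a purely cosmetic convenience, and the commands above remain the authoritative versions:
+
+```bash
+QRELS=tools/topics-and-qrels/qrels.dl19-passage.txt
+RUN=runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl19-passage.txt
+bin/trec_eval -m map -c -l 2 $QRELS $RUN
+bin/trec_eval -m ndcg_cut.10 -c $QRELS $RUN
+bin/trec_eval -m recall.100 -c -l 2 $QRELS $RUN
+bin/trec_eval -m recall.1000 -c -l 2 $QRELS $RUN
+```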
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..77a2dcf011 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
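+
+Because the indexing command above runs in the background and redirects all of its output, the easiest way to follow its progress is to tail the log file it writes:
+
+```bash
+tail -f logs/log.msmarco-passage-cos-dpr-distil.parquet
+```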
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..3610cceb0a --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md @@ -0,0 +1,120 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
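+
+To check the download against the checksum above, assuming GNU coreutils is available (on macOS, the `md5` utility prints the same digest):
+
+```bash
+# Compute the MD5 digest of the downloaded tarball; it should match c8a204fbc3ccda581aa375936af43a97.
+md5sum collections/msmarco-passage-cos-dpr-distil.parquet.tar
+```
+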
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md new file mode 100644 index 0000000000..0ddb63a4c2 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
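+
+The cached query encodings used in the retrieval command below are distributed as gzipped JSONL under `tools/topics-and-qrels/`; a quick way to peek at one record (assuming standard gzip tooling, and noting that the exact field names are not documented here):
+
+```bash
+# Print the first 200 characters of the first cached query record.
+gzip -dc tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz | head -n 1 | cut -c1-200
+```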
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md new file mode 100644 index 0000000000..ac7b584b70 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -0,0 +1,120 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
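+
+Once the archive has been unpacked (and the checksum verified), the tarball itself is no longer needed; if disk space is a concern, it can optionally be removed:
+
+```bash
+# Optional: reclaim roughly 38 GB by deleting the tarball after extraction.
+rm collections/msmarco-passage-cos-dpr-distil.parquet.tar
+```
+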
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.cos-dpr-distil.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl19-passage.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.466 | +| **nDCG@10** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.617 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.820 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md index 7e2a9d13c9..3e81ea7a6f 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw-int8.cached.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md index 6e13736684..1b8f654896 100644 --- a/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.hnsw.cached.md @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..e469a120ee --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat-int8.cached.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
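+
+To verify the download against the checksum above in a single, scriptable step (GNU coreutils `md5sum`; on macOS, compute the digest with `md5` and compare manually):
+
+```bash
+# md5sum -c expects "checksum  filename" (two spaces) and prints OK on a match.
+echo "fa3637e9c4150b157270e19ef3a4f779  collections/msmarco-passage-openai-ada2.parquet.tar" | md5sum -c -
+```
+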
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4788 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7035 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6235 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8629 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. 
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat.cached.md new file mode 100644 index 0000000000..e95fbd71a7 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4788 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7035 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6235 | 
+| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8629 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..300fbcfc57 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw-int8.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
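+
+As a quick sanity check that the qrels are in place, the number of distinct judged topics can be counted; it should match the 43 topics mentioned above (a sketch, assuming the submodule has been checked out):
+
+```bash
+# Count unique topic IDs in the DL19 passage qrels; expected output: 43.
+awk '{print $1}' tools/topics-and-qrels/qrels.dl19-passage.txt | sort -u | wc -l
+```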
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.703 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.623 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md new file mode 100644 index 0000000000..3eb678a1f4 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage.openai-ada2.parquet.hnsw.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage.openai-ada2.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl19-passage.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.479 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.703 | +| **R@100** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.623 | +| **R@1000** | **OpenAI-ada2**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.863 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. 
+Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.cached.md index 8ab2c0abec..349abef6b4 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.flat-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.onnx.md index e395e83058..c48b5919bd 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat-int8.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.flat-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.cached.md index 1dae4ced64..837b59cd31 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.flat.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.onnx.md index a1a750976d..6fba001d28 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.flat.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.flat.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md index 2ae1b61592..516e3c6c26 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.hnsw-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index 1a41467d6a..d86d2175cb 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md index e4c98a18b6..ae3c3a9073 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.cached.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.hnsw.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md index f449c76ba2..8cf3c0ba5a 100644 --- a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.hnsw.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..a594a3833b --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
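+
+If the `tools/` directory is empty (for example, after a plain `git clone` without submodules), the topics and qrels referenced above will not be present locally.
+They can be pulled in with a standard git command; this is a generic sketch rather than a step specific to this regression:
+
+```bash
+# Fetch the anserini-tools submodule that provides tools/topics-and-qrels/
+git submodule update --init --recursive
+```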
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4650 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6780 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7169 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8503 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md new file mode 100644 index 0000000000..5a0c0890f9 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
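+
+As a quick sanity check, the checksum of the download can be verified before indexing; this sketch assumes `md5sum` is available (on macOS, `md5 -r` prints a comparable digest):
+
+```bash
+# Expected output: b235e19ec492c18a18057b30b8b23fd4  collections/msmarco-passage-bge-base-en-v1.5.parquet.tar
+md5sum collections/msmarco-passage-bge-base-en-v1.5.parquet.tar
+```
+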
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl20.txt \ + -encoder BgeBaseEn15 -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4650 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6780 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7169 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8503 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). 
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.cached.md new file mode 100644 index 0000000000..bdcd9d5509 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4650 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6780 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7169 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8503 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.md new file mode 100644 index 0000000000..0e909d8c72 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.md @@ -0,0 +1,116 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
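+
+One way to tell when the backgrounded indexing job has finished is to follow its log; the path below matches the redirect in the indexing command above, and the command itself is a generic shell sketch:
+
+```bash
+# Follow indexing progress; Ctrl-C stops tailing, not the backgrounded indexing job
+tail -f logs/log.msmarco-passage-bge-base-en-v1.5.parquet
+```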
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl20.txt \ + -encoder BgeBaseEn15 -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4650 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6780 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7169 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8503 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..21d3c5fbf3 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
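+
+It may also be worth confirming that the archive unpacked into a directory of Parquet files matching the `--corpus-path` used below; a minimal sketch, assuming the default `collections/` layout:
+
+```bash
+# List a few of the unpacked Parquet files and report the total size on disk
+ls collections/msmarco-passage-bge-base-en-v1.5.parquet/ | head
+du -sh collections/msmarco-passage-bge-base-en-v1.5.parquet/
+```
+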
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..1497f522be --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md new file mode 100644 index 0000000000..968167b8c8 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.dl20.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more.
+Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials).
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md
new file mode 100644
index 0000000000..e94dc71ee8
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md
@@ -0,0 +1,118 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model.
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
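+If `tools/topics-and-qrels/` is empty in your checkout, the submodule has likely not been fetched yet; a minimal sketch of the usual fix (standard git, run from the Anserini root):
+
+```bash
+# Pull in the anserini-tools submodule that provides topics and qrels.
+git submodule update --init --recursive
+```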
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.465 | +| **nDCG@10** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.678 | +| **R@100** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.717 | +| **R@1000** | **BGE-base-en-v1.5**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.850 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index 7343b9ad2b..30e33d7ba4 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -60,7 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. 
Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md index 3051574925..b8294b8e5f 100644 --- a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -60,7 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..82ee8403a1 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md @@ -0,0 +1,112 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
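+The manual commands below write into `collections/`, `indexes/`, `logs/`, and `runs/`; if any of these directories are missing from your checkout, a harmless way to create them up front (a convenience sketch, not part of the official regression flow) is:
+
+```bash
+# Create the working directories used by the copy/paste commands below.
+mkdir -p collections indexes logs runs
+```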
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
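+Since this regression uses cached queries, the topics file below contains pre-computed query vectors rather than raw query text; if you are curious what the cached encodings look like, a quick peek (assuming GNU gzip's `zcat`) is:
+
+```bash
+# Print the first few hundred characters of the first cached query record.
+zcat tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz | head -c 300; echo
+```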
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cohere-embed-english-v3.0**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.5067 |
+| **nDCG@10** | **cohere-embed-english-v3.0**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7245 |
+| **R@100** | **cohere-embed-english-v3.0**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7279 |
+| **R@1000** | **cohere-embed-english-v3.0**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8682 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.md new file mode 100644 index 0000000000..86b89a079a --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.md @@ -0,0 +1,110 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
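+Before (or while) unpacking, it is worth making sure the filesystem holding `collections/` has enough headroom, since the tarball and the extracted Parquet files sit side by side; a quick check with standard coreutils:
+
+```bash
+# Show free space on the filesystem that backs collections/.
+df -h collections/
+```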
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached \
+  --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+bin/run.sh io.anserini.index.IndexFlatDenseVectors \
+  -threads 16 \
+  -collection ParquetDenseVectorCollection \
+  -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \
+  -generator ParquetDenseVectorDocumentGenerator \
+  -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet &
+```
+
+The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cohere-embed-english-v3.0**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.5067 |
+| **nDCG@10** | **cohere-embed-english-v3.0**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7245 |
+| **R@100** | **cohere-embed-english-v3.0**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7279 |
+| **R@1000** | **cohere-embed-english-v3.0**|
+| [DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8682 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..786c4976f6 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md @@ -0,0 +1,113 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
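+As a quick sanity check that the qrels are in place, counting the distinct topic IDs should give the 54 judged topics mentioned above; a small sketch (assuming the usual whitespace-separated TREC qrels format):
+
+```bash
+# Count distinct topic IDs in the DL20 passage qrels; expect 54.
+awk '{print $1}' tools/topics-and-qrels/qrels.dl20-passage.txt | sort -u | wc -l
+```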
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.507 | +| **nDCG@10** | **cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.728 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.868 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md new file mode 100644 index 0000000000..20b2e2dfcf --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -0,0 +1,113 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
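+After unpacking, a quick way to confirm that the corpus landed where the commands below expect it (the `--corpus-path` argument) is to list a few of the Parquet files:
+
+```bash
+# Spot-check the unpacked corpus directory.
+ls collections/msmarco-passage-cohere-embed-english-v3.0.parquet | head
+```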
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.dl20.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.dl20.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.507 | +| **nDCG@10** | **cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.725 | +| **R@100** | **cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.728 | +| **R@1000** | 
**cohere-embed-english-v3.0**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.868 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md index 74deab191c..990f175467 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md index 2b5444244e..69244961ff 100644 --- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. 
-
 ## Retrieval
 
 Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md
index 2da624aef9..ea5604003e 100644
--- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md
+++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.cached.md
@@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \
 The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
 Upon completion, we should have an index with 8,841,823 documents.
 
-
 ## Retrieval
 
 Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md
index 96eea55ea2..dd93e4d37e 100644
--- a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md
+++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.hnsw.onnx.md
@@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \
 The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
 Upon completion, we should have an index with 8,841,823 documents.
 
-
 ## Retrieval
 
 Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.md
new file mode 100644
index 0000000000..06120623ed
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.md
@@ -0,0 +1,117 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with quantized flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4876 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7025 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7204 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8533 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.md
new file mode 100644
index 0000000000..21f486b6ae
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.md
@@ -0,0 +1,119 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`.
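+If you want to double-check what the archive contains (for example, to confirm the directory name that the `--corpus-path` argument below should point at), listing the tarball without extracting it again is cheap:
+
+```bash
+# List the first few entries of the archive; they should sit under the
+# directory used as --corpus-path below.
+tar tf collections/msmarco-passage-cos-dpr-distil.parquet.tar | head
+```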
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl20.txt \ + -encoder CosDprDistil -hits 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4876 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7025 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7204 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8533 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. 
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.cached.md new file mode 100644 index 0000000000..4ed64751e4 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
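+
+If the `tools/` directory is empty (e.g., the repository was cloned without its submodules), the topics and qrels can be pulled in with a standard git command; this is plain git, not an Anserini-specific step:
+
+```bash
+# Populate the anserini-tools submodule that provides tools/topics-and-qrels/.
+git submodule update --init --recursive
+```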
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4876 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7025 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7204 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8533 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.onnx.md new file mode 100644 index 0000000000..e2ec83ee23 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.flat.onnx.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
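+
+If you want to peek at the archive before committing to the full unpacking, `tar` can list its contents without extracting anything:
+
+```bash
+# Show the first few entries of the tarball without extracting it.
+tar tvf collections/msmarco-passage-cos-dpr-distil.parquet.tar | head
+```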
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl20.txt \ + -encoder CosDprDistil -hits 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4876 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7025 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7204 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8533 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. 
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..50873ff717 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..f5d8695600 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md @@ -0,0 +1,120 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md new file mode 100644 index 0000000000..4bc6d86cb9 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -0,0 +1,118 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md new file mode 100644 index 0000000000..42ea4f4fab --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -0,0 +1,120 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.cos-dpr-distil.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.dl20.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.dl20.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.488 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.702 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.720 | +| **R@1000** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.853 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. 
+Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md index 3ee98985b3..2392d1524e 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw-int8.cached.md @@ -65,9 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md index 4257f91bdb..2084ad0f2a 100644 --- a/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.hnsw.cached.md @@ -65,7 +65,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat-int8.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..88613cab9e --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat-int8.cached.md @@ -0,0 +1,117 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
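+
+Given the size of this tarball, it may be worth confirming that the target filesystem has enough free space before unpacking (as a rough rule of thumb, budget room for both the archive and the extracted corpus):
+
+```bash
+# Check free space on the filesystem that holds collections/.
+df -h collections/
+```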
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl20.openai-ada2.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.dl20.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4771 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6759 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7237 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8705 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. 
+ +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat.cached.md new file mode 100644 index 0000000000..a4dd6b3363 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.flat.cached.md @@ -0,0 +1,115 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
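+
+The script is driven entirely by command-line flags; assuming the usual argparse setup, the standard help flag lists everything it supports:
+
+```bash
+# Show all supported flags for the regression driver.
+python src/main/python/run_regression.py --help
+```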
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchFlatDenseVectors \ + -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl20.openai-ada2.jsonl.txt \ + -hits 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.dl20.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4771 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6759 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7237 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8705 |
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat.cached.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md
new file mode 100644
index 0000000000..8cbe9f82a4
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw-int8.cached.md
@@ -0,0 +1,118 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
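+
+The topics file used below holds the cached OpenAI-ada2 query encodings, one JSON object per line (a topic id plus its embedding); if you're curious what the cached queries look like, a quick peek works fine (field names are whatever the file actually uses):
+
+```bash
+# Inspect the beginning of the first cached query vector without decompressing the whole file.
+gunzip -c tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz | head -n 1 | cut -c 1-200
+```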
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.dl20.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.724 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.871 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md
new file mode 100644
index 0000000000..6ef8e09800
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage.openai-ada2.parquet.hnsw.cached.md
@@ -0,0 +1,118 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw.cached
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw.cached
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/
+tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/
+```
+
+To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`.
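+
+If you'd like to verify the download before unpacking, something along these lines should do it (assumes GNU coreutils `md5sum`; on macOS, `md5` produces a comparable digest):
+
+```bash
+# Check the tarball against the published MD5 checksum before unpacking.
+echo "fa3637e9c4150b157270e19ef3a4f779  collections/msmarco-passage-openai-ada2.parquet.tar" | md5sum -c -
+```
+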
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage.openai-ada2.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.dl20.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt +bin/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.dl20.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 | +| **nDCG@10** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 | +| **R@100** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.724 | +| **R@1000** | **OpenAI-ada2**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.871 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. 
+Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.md index f60ba513b5..7266ed1e32 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.md index 8321695b85..f5fe02950f 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.cached.md index ea88359b6a..48a6af17b5 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.cached.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.flat.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.md index 7a0a2b2a36..4d283d13e6 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.flat.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md index f91f6d50c4..6c8fcd20a1 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -62,9 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md index 3b85d708f6..2bdfc8bba8 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -62,9 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md index 7936bda149..6e3ede46db 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -62,7 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md index 72b26fe42b..6854478d64 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.md @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -62,7 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-bge-base-en-v1.5/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..f3701a02a2 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.md @@ -0,0 +1,105 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
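+
+If `tools/topics-and-qrels/` is empty, the submodule probably has not been checked out yet; the standard incantation fixes that:
+
+```bash
+# Fetch the anserini-tools submodule that provides tools/topics-and-qrels/.
+git submodule update --init --recursive
+```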
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **BGE-base-en-v1.5**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3641 |
+| **RR@10** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3583 |
+| **R@100** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9006 |
+| **R@1000** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9811 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md
new file mode 100644
index 0000000000..d27cb3b4e8
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.md
@@ -0,0 +1,105 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023.
+ +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
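+
+Unlike the cached-queries variant, this regression encodes the raw query text at search time, so the topics file below is plain TSV (query id, then query text); a quick look confirms the format:
+
+```bash
+# Each line is: <query id> <tab> <query text>.
+head -n 3 tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt
+```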
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt \
+  -encoder BgeBaseEn15 -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **BGE-base-en-v1.5**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3641 |
+| **RR@10** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3583 |
+| **R@100** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9006 |
+| **R@1000** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9811 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.md
new file mode 100644
index 0000000000..4332190850
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.md
@@ -0,0 +1,103 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
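+
+As a quick sanity check that the cached queries line up with the dev subset, you can count them; each line is one pre-encoded query, so the count should come out to 6980:
+
+```bash
+# Count the cached BGE query vectors; this should match the 6980 dev-subset questions.
+gunzip -c tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz | wc -l
+```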
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **BGE-base-en-v1.5**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3641 |
+| **RR@10** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3583 |
+| **R@100** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9006 |
+| **R@1000** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9811 |
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.md
new file mode 100644
index 0000000000..8df34e8440
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.md
@@ -0,0 +1,104 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
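+
+Before launching the full retrieval run shown below, it can be handy to smoke-test the setup (ONNX encoder, index path) on a handful of queries; a sketch, with arbitrary file names for the truncated topics and the output run:
+
+```bash
+# Smoke test: ONNX query encoding + flat-index search over just the first 100 dev queries.
+mkdir -p runs
+head -n 100 tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt > runs/topics.dev-subset.first100.txt
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \
+  -topics runs/topics.dev-subset.first100.txt \
+  -topicReader TsvInt \
+  -output runs/run.bge-flat-onnx.first100.txt \
+  -encoder BgeBaseEn15 -hits 1000 -threads 16
+```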
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.msmarco-passage.dev-subset.txt \
+  -encoder BgeBaseEn15 -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-flat-onnx.topics.msmarco-passage.dev-subset.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **BGE-base-en-v1.5**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3641 |
+| **RR@10** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3583 |
+| **R@100** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9006 |
+| **R@1000** | **BGE-base-en-v1.5**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9811 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on non-quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md
new file mode 100644
index 0000000000..3d5923d375
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.md
@@ -0,0 +1,110 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
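+
+Because the indexing command above runs in the background (note the trailing `&`), it is worth confirming that it has finished before moving on to retrieval; a minimal way to do so (plain shell, nothing Anserini-specific) is to watch the log file it writes:
+
+```bash
+# Follow indexing progress; interrupt with Ctrl-C once the job reports completion.
+tail -f logs/log.msmarco-passage-bge-base-en-v1.5.parquet
+```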
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | +| **RR@10** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | +| **R@100** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | +| **R@1000** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..2ea343cd2d --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.md @@ -0,0 +1,110 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
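+
+To check the tarball you actually downloaded against this value (a sketch; `md5sum` is the GNU coreutils tool, on macOS substitute `md5 -q`):
+
+```bash
+# Compute the checksum of the downloaded tarball; it should match the value quoted above.
+md5sum collections/msmarco-passage-bge-base-en-v1.5.parquet.tar
+```
+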
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | +| **RR@10** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | +| **R@100** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | +| **R@1000** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** 
indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md new file mode 100644 index 0000000000..bba5fde61c --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.md @@ -0,0 +1,110 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-cached.topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | +| **RR@10** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | +| **R@100** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | +| **R@1000** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | + +The above figures are from running 
brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md new file mode 100644 index 0000000000..75a507362d --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.md @@ -0,0 +1,110 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-bge-base-en-v1.5.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-bge-base-en-v1.5.parquet.tar` is 39 GB and has MD5 checksum `b235e19ec492c18a18057b30b8b23fd4`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-bge-base-en-v1.5.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-bge-base-en-v1.5.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-bge-base-en-v1.5.parquet & +``` + +The path `/path/to/msmarco-passage-bge-base-en-v1.5.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-bge-base-en-v1.5.parquet.bge-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **BGE-base-en-v1.5**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.364 | +| **RR@10** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.358 | +| **R@100** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.901 | +| **R@1000** | **BGE-base-en-v1.5**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.981 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. 
+With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md index a9caa19e0e..a0b4d2a387 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.md @@ -60,9 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md index 4b8b6e470a..a517118b69 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.md @@ -60,7 +60,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cohere-embed-english-v3.0/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md
new file mode 100644
index 0000000000..ffa149000e
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.md
@@ -0,0 +1,103 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking).
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/
+tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`.
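+
+As a quick sanity check after unpacking (a sketch; the directory name is assumed to match the tarball, as in the `--corpus-path` used below):
+
+```bash
+# List the contents of the unpacked corpus directory and report its total size.
+ls collections/msmarco-passage-cohere-embed-english-v3.0.parquet | head
+du -sh collections/msmarco-passage-cohere-embed-english-v3.0.parquet
+```
+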
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached \
+  --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet
+```
+
+## Indexing
+
+Sample indexing command, building quantized flat indexes:
+
+```bash
+bin/run.sh io.anserini.index.IndexFlatDenseVectors \
+  -threads 16 \
+  -collection ParquetDenseVectorCollection \
+  -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \
+  -generator ParquetDenseVectorDocumentGenerator \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -quantize.int8 \
+  >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet &
+```
+
+The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cohere-embed-english-v3.0**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3716 |
+| **RR@10** | **cohere-embed-english-v3.0**|
+| [MS MARCO Passage: 
Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3658 | +| **R@100** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.8935 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9786 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.md new file mode 100644 index 0000000000..715597e3b0 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.md @@ -0,0 +1,101 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached \
+  --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+bin/run.sh io.anserini.index.IndexFlatDenseVectors \
+  -threads 16 \
+  -collection ParquetDenseVectorCollection \
+  -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \
+  -generator ParquetDenseVectorDocumentGenerator \
+  -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet &
+```
+
+The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-flat-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cohere-embed-english-v3.0**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3716 |
+| **RR@10** | **cohere-embed-english-v3.0**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3658 |
+| **R@100** | 
**cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.8935 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9786 | + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..169d0a6c69 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.md @@ -0,0 +1,108 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.372 | +| **RR@10** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: 
Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.366 | +| **R@100** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.893 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.979 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md new file mode 100644 index 0000000000..fd7a37f619 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.md @@ -0,0 +1,108 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
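+
+The `--download`, `--index`, `--verify`, and `--search` flags select which stages run, so (assuming the script treats them independently, which is not spelled out on this page) a regression whose corpus and index already exist from a previous run can be re-verified and re-searched without re-indexing:
+
+```bash
+# Hypothetical partial re-run: --index is omitted because the index built earlier is reused.
+python src/main/python/run_regression.py --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached
+```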
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cohere-embed-english-v3.0.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cohere-embed-english-v3.0.parquet.tar` is 16 GB and has MD5 checksum `40c5caf33476746e93ceeb75174b8d64`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cohere-embed-english-v3.0.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cohere-embed-english-v3.0.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cohere-embed-english-v3.0.parquet & +``` + +The path `/path/to/msmarco-passage-cohere-embed-english-v3.0.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cohere-embed-english-v3.0.parquet.cohere-embed-english-v3.0-hnsw-cached.topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to 
reproduce the following results: + +| **AP@1000** | **cohere-embed-english-v3.0**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.372 | +| **RR@10** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.366 | +| **R@100** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.893 | +| **R@1000** | **cohere-embed-english-v3.0**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.979 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md index b1f08502e4..d4ff128e7c 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.md @@ -62,9 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md index 215b23ae04..93904dec5d 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.md @@ -62,9 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. 
-Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md index e508690834..e8fcd8adac 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.cached.md @@ -62,7 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md index dab73d2c7d..3486c77691 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.md @@ -62,7 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..afa7f33a02 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.md @@ -0,0 +1,105 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
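+
+The cached query encodings are distributed alongside the topics and qrels; the `JsonIntVector` topic reader used below consumes a gzipped JSONL file with one record per query.
+As an optional sanity check (a minimal sketch, assuming a Linux environment with `zcat` on the path), you can peek at the beginning of the first record of the topics file referenced in the retrieval command below:
+
+```bash
+# Show the first 200 characters of the first cached query encoding:
+zcat tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz | head -n 1 | cut -c1-200
+```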
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3942 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3896 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9075 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9796 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.md
new file mode 100644
index 0000000000..d713d0668a
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.md
@@ -0,0 +1,107 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. 
[Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
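+
+If `tools/topics-and-qrels/` turns out to be empty, the `anserini-tools` submodule has likely not been initialized; standard git commands (not specific to this regression) will fetch it:
+
+```bash
+# Pull in the anserini-tools submodule that provides tools/topics-and-qrels/:
+git submodule update --init --recursive
+```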
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt \
+  -encoder CosDprDistil -hits 1000 -threads 16 &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-int8-onnx.topics.msmarco-passage.dev-subset.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3942 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3896 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9075 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9796 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.md
new file mode 100644
index 0000000000..672ea80561
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.md
@@ -0,0 +1,103 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. 
+ +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
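+
+As an optional check that the cached topics cover the full dev subset, count the records in the gzipped JSONL file used by the retrieval command below (one JSON record per line); the count should match the 6980 queries mentioned above:
+
+```bash
+# Expect 6980, one cached query encoding per dev-set question:
+zcat tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz | wc -l
+```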
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3942 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3896 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9075 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9796 |
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.md
new file mode 100644
index 0000000000..3febe0dcc2
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.md
@@ -0,0 +1,106 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. 
+ +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +bin/run.sh io.anserini.index.IndexFlatDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.msmarco-passage.dev-subset.txt \
+  -encoder CosDprDistil -hits 1000 -threads 16 &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.msmarco-passage.dev-subset.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-flat-onnx.topics.msmarco-passage.dev-subset.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3942 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3896 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9075 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9796 |
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on non-quantized indexes, results may differ slightly.
diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md
new file mode 100644
index 0000000000..02ef95c10f
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.md
@@ -0,0 +1,112 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. 
+ +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
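+
+Indexing runs in the background (note the trailing `&` in the command above), so before moving on to retrieval you can follow the log file it writes to see when it finishes:
+
+```bash
+# Follow the indexing log from the command above; Ctrl-C stops following without affecting the job:
+tail -f logs/log.msmarco-passage-cos-dpr-distil.parquet
+```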
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md new file mode 100644 index 0000000000..722d907c87 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.md @@ -0,0 +1,112 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
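+
+As an optional integrity check before unpacking (assuming GNU coreutils `md5sum`; on macOS, `md5 -q` prints the same hash), verify the download against the checksum above:
+
+```bash
+# Should print "collections/msmarco-passage-cos-dpr-distil.parquet.tar: OK":
+echo "c8a204fbc3ccda581aa375936af43a97  collections/msmarco-passage-cos-dpr-distil.parquet.tar" | md5sum -c -
+```
+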
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
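+
+The search also runs in the background; once it completes, a quick sanity check (not part of the regression) is that the run file contains up to 1000 hits for each of the 6980 dev queries, i.e., roughly 6,980,000 lines:
+
+```bash
+# Expect roughly 6,980,000 lines (up to 1000 hits per query for 6980 queries):
+wc -l runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt
+```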
+ +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-int8-onnx.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md new file mode 100644 index 0000000000..6bd00c57fd --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.md @@ -0,0 +1,112 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). 
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
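+
+The qrels file pairs these dev queries with their relevance judgments in the usual whitespace-delimited TREC format, with the query id in the first column; counting distinct query ids is a quick way to confirm the size of the evaluation set:
+
+```bash
+# Expect 6980 distinct query ids in the dev-subset qrels:
+awk '{print $1}' tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt | sort -u | wc -l
+```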
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-cached.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. 
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md new file mode 100644 index 0000000000..4e156fe10c --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.md @@ -0,0 +1,112 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.parquet.tar` is 38 GB and has MD5 checksum `c8a204fbc3ccda581aa375936af43a97`. 
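+
+After unpacking, the corpus should appear as a directory of Parquet files at `collections/msmarco-passage-cos-dpr-distil.parquet/`, which is the `--corpus-path` passed below; a quick listing confirms the extraction (exact shard names may vary):
+
+```bash
+# List a few of the extracted Parquet shards and report the total size on disk:
+ls collections/msmarco-passage-cos-dpr-distil.parquet | head
+du -sh collections/msmarco-passage-cos-dpr-distil.parquet
+```
+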
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx \ + --corpus-path collections/msmarco-passage-cos-dpr-distil.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-cos-dpr-distil.parquet & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ + -topicReader TsvInt \ + -output runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt \ + -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 & +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. 
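+
+The `-efSearch 1000` setting above controls how many candidates the HNSW graph search keeps at query time; smaller values are faster but typically reduce recall.
+Purely as an illustration (not part of the regression, and the output file name here is arbitrary), the same search can be rerun with a narrower beam:
+
+```bash
+# Example only: identical search with -efSearch 100 instead of 1000; output name is arbitrary.
+bin/run.sh io.anserini.search.SearchHnswDenseVectors \
+  -index indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+  -topicReader TsvInt \
+  -output runs/run.cos-dpr-distil.hnsw-onnx.efSearch100.txt \
+  -encoder CosDprDistil -hits 1000 -efSearch 100 -threads 16
+```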
+ +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.parquet.cos-dpr-distil-hnsw-onnx.topics.msmarco-passage.dev-subset.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.394 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.390 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.908 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.980 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md index ecf2959c9c..7ed2ac4d18 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw-int8.cached.md @@ -62,9 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md index aec65c4415..94256c3a52 100644 --- a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.hnsw.cached.md @@ -62,7 +62,6 @@ bin/run.sh io.anserini.index.IndexHnswDenseVectors \ The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.md new file mode 100644 index 0000000000..29f213d104 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.md @@ -0,0 +1,105 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/
+tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/
+```
+
+To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached \
+  --corpus-path collections/msmarco-passage-openai-ada2.parquet
+```
+
+## Indexing
+
+Sample indexing command, building quantized flat indexes:
+
+```bash
+bin/run.sh io.anserini.index.IndexFlatDenseVectors \
+  -threads 16 \
+  -collection ParquetDenseVectorCollection \
+  -input /path/to/msmarco-passage-openai-ada2.parquet \
+  -generator ParquetDenseVectorDocumentGenerator \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \
+  -quantize.int8 \
+  >& logs/log.msmarco-passage-openai-ada2.parquet &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000**                                                                                                    | **OpenAI-ada2**|
+|:---------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)                                   | 0.3505    |
+| **RR@10**                                                                                                       | **OpenAI-ada2**|
+| [MS MARCO 
Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3434 | +| **R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.8996 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9858 | + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat.cached.md new file mode 100644 index 0000000000..8ea76e5f1f --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.flat.cached.md @@ -0,0 +1,103 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
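+
+To verify the download before unpacking (optional; this assumes the standard `md5sum` utility is available on your machine), recompute the checksum and compare it against the value above:
+
+```bash
+# Recompute the MD5 checksum of the downloaded tarball (slow for a 75 GB file).
+md5sum collections/msmarco-passage-openai-ada2.parquet.tar
+```
+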
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.flat.cached \
+  --corpus-path collections/msmarco-passage-openai-ada2.parquet
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+bin/run.sh io.anserini.index.IndexFlatDenseVectors \
+  -threads 16 \
+  -collection ParquetDenseVectorCollection \
+  -input /path/to/msmarco-passage-openai-ada2.parquet \
+  -generator ParquetDenseVectorDocumentGenerator \
+  -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \
+  >& logs/log.msmarco-passage-openai-ada2.parquet &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+bin/run.sh io.anserini.search.SearchFlatDenseVectors \
+  -index indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ \
+  -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \
+  -topicReader JsonIntVector \
+  -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \
+  -hits 1000 -threads 16 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-flat-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000**                                                                                                    | **OpenAI-ada2**|
+|:---------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)                                   | 0.3505    |
+| **RR@10**                                                                                                       | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)                                   | 0.3434    |
+| **R@100**                                                                                                       | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)                                   | 0.8996    |
+| **R@1000**                                                                                                      | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)                                   | 0.9858    |
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible 
_exactly_. diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md new file mode 100644 index 0000000000..1429988ffb --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.md @@ -0,0 +1,111 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
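+
+After unpacking, it can also be worth confirming that the corpus directory is in place before kicking off indexing; the exact file listing below is only illustrative, since it depends on how the Parquet files are sharded:
+
+```bash
+# List a few of the unpacked files and report the total size of the corpus directory.
+ls collections/msmarco-passage-openai-ada2.parquet/ | head
+du -sh collections/msmarco-passage-openai-ada2.parquet/
+```
+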
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 -quantize.int8 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-int8-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| **R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.986 | + +The above figures are from running 
brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md new file mode 100644 index 0000000000..2269960955 --- /dev/null +++ b/docs/regressions/regressions-msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.md @@ -0,0 +1,111 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw.cached +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw.cached +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar -P collections/ +tar xvf collections/msmarco-passage-openai-ada2.parquet.tar -C collections/ +``` + +To confirm, `msmarco-passage-openai-ada2.parquet.tar` is 75 GB and has MD5 checksum `fa3637e9c4150b157270e19ef3a4f779`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.openai-ada2.parquet.hnsw.cached \ + --corpus-path collections/msmarco-passage-openai-ada2.parquet +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +bin/run.sh io.anserini.index.IndexHnswDenseVectors \ + -threads 16 \ + -collection ParquetDenseVectorCollection \ + -input /path/to/msmarco-passage-openai-ada2.parquet \ + -generator ParquetDenseVectorDocumentGenerator \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -M 16 -efC 100 \ + >& logs/log.msmarco-passage-openai-ada2.parquet & +``` + +The path `/path/to/msmarco-passage-openai-ada2.parquet/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +bin/run.sh io.anserini.search.SearchHnswDenseVectors \ + -index indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \ + -topicReader JsonIntVector \ + -output runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \ + -hits 1000 -efSearch 1000 -threads 16 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +bin/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +bin/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.parquet.openai-ada2-hnsw-cached.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **OpenAI-ada2**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 | +| **RR@10** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 | +| **R@100** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.900 | +| **R@1000** | **OpenAI-ada2**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.986 | + +The above figures are from running brute-force search with cached queries on non-quantized **flat** 
indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.template) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.cached.template index 6f1a137b72..62ef565aef 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.onnx.template index 50f6e7e051..d5c1eeb0df 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat-int8.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.cached.template index 38ce981038..e0a404c162 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.onnx.template index 97d2339fa2..41c190ca8b 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.flat.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template index 11d7f8a4c6..10c58940e4 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index dcd7a050b0..7d69fb6d1c 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template index d258062b18..9c87aa3e9c 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template index 296f1c195f..c373ed11ba 100644 --- a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..cfc6aab4be --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template new file mode 100644 index 0000000000..d2521082fc --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.template new file mode 100644 index 0000000000..3628965a2d --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.template new file mode 100644 index 0000000000..d6c6909259 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..570862e6c1 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..b505ca28f2 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
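+
+As a quick sanity check after indexing, it can be useful to look at what was just written to disk. The sketch below is illustrative only; the index directory is a placeholder for whatever path was specified in the indexing command above:
+
+```bash
+# Illustrative only: replace the placeholder with the index directory
+# used in the indexing command above.
+du -sh indexes/<your-index-directory>/
+ls -lh indexes/<your-index-directory>/
+```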
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.template new file mode 100644 index 0000000000..8e7222aed7 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..a65ab6ee31 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index 5ed06e455a..4b2fb337fa 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -53,7 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template index 5bfb8df923..8e7fc7a345 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -53,7 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..d6878d6f5d --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template @@ -0,0 +1,89 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
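+
+If you want to double-check the topic count, counting distinct query IDs in the qrels file is a quick way to do it. This is a minimal sketch, assuming the qrels live at the usual location under the `tools/` submodule; adjust the path to match your checkout:
+
+```bash
+# Count distinct topic (query) IDs in the DL19 passage qrels; expect 43.
+# Path assumes the standard anserini-tools submodule layout.
+awk '{print $1}' tools/topics-and-qrels/qrels.dl19-passage.txt | sort -u | wc -l
+```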
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.template
new file mode 100644
index 0000000000..93a902334f
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.template
@@ -0,0 +1,88 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html).
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
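+
+The `--download`, `--index`, `--verify`, and `--search` flags shown above appear to gate individual stages of the regression. Assuming that is the case, and that the corpus and index from an earlier run are still in place, a partial re-run might look like the following (an illustrative sketch, not an officially documented invocation):
+
+```bash
+# Hypothetical partial re-run: skip downloading and indexing, and only
+# re-run verification and retrieval/evaluation against the existing index.
+python src/main/python/run_regression.py --verify --search --regression ${test_name}
+```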
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
+
+❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template
new file mode 100644
index 0000000000..27bb713495
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template
@@ -0,0 +1,90 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html).
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). 
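+
+To make the `-l 2` point concrete, here is an illustrative pair of `trec_eval` invocations; the qrels path assumes the standard `tools/` submodule layout, the run file name is a placeholder for whatever `${ranking_cmds}` produced, and the `trec_eval` binary location may differ on your setup (consult the expanded `${eval_cmds}` for the exact commands):
+
+```bash
+# nDCG@10 keeps qrels of all relevance grades (no -l flag).
+bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.dl19-passage.txt
+
+# For AP, set the minimum relevance level to 2 so that grade 1 counts as not relevant.
+bin/trec_eval -c -l 2 -m map tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.dl19-passage.txt
+```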
+ +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template new file mode 100644 index 0000000000..12a4e07c08 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template @@ -0,0 +1,90 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template index c55d960745..ba15af2230 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template index d9429c26e2..49cc5ff8ba 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. 
-See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template index 41405cdc75..01f1de1cdc 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.cached.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template index d9a0c9b14f..7ad66ac94f 100644 --- a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.hnsw.onnx.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..9f67f9423a --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.template new file mode 100644 index 0000000000..26ef4092c7 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.template @@ -0,0 +1,96 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.cached.template new file mode 100644 index 0000000000..cc7d902f6f --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.onnx.template new file mode 100644 index 0000000000..38fd8df4b2 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.flat.onnx.template @@ -0,0 +1,96 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
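+
+As an optional sanity check (a sketch only: it assumes the index was written under `indexes/`, the default layout used by `run_regression.py`; substitute whatever `-index` path appears in the indexing command above), you can eyeball the on-disk footprint of the newly built index:
+
+```bash
+# Assumes the default indexes/ output directory; adjust if -index pointed elsewhere.
+du -sh indexes/*
+```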
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..4daae2e9ee --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..7cce1427df --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.template new file mode 100644 index 0000000000..8975af68cc --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..c1c2bccac1 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template index 045df030c4..f6337c8157 100644 --- a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw-int8.cached.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template index 50add8dc05..cd90b0ec9b 100644 --- a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.hnsw.cached.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..f79851f972 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat.cached.template new file mode 100644 index 0000000000..839f78a07a --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). 
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
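+
+As a concrete illustration of this convention (a sketch only: the qrels and run file names below are assumptions for illustration, and the authoritative invocations are the ones generated by `${eval_cmds}` above; `trec_eval` here means whichever binary those commands use):
+
+```bash
+# Illustrative paths; use the exact commands from ${eval_cmds}.
+# nDCG@10 is computed over qrels with all relevance grades:
+trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.dl19-passage.txt
+# AP and recall treat grade 1 as non-relevant, hence -l 2:
+trec_eval -c -l 2 -m map -m recall.1000 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.dl19-passage.txt
+```
+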
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..ad76176266 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. 
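+
+To verify the download before unpacking (optional; a sketch assuming `md5sum` from GNU coreutils is available, so use `md5` instead on macOS), the printed digest should match `${download_checksum}`:
+
+```bash
+# Compare the output against ${download_checksum} from the regression YAML.
+md5sum collections/${corpus}.tar
+```
+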
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw.cached.template new file mode 100644 index 0000000000..d835f4ff1d --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage.openai-ada2.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). 
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.cached.template index 16d2bf2619..103072c5b1 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.onnx.template index de5f36d973..de783b7f24 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat-int8.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.cached.template index 76adbf78ba..a892532573 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.onnx.template index 80f5f169a9..b4aa58e6d1 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.flat.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template index 9567903001..405dc08416 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index b197088a4c..cc9e3721ea 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template index 18fdd1ebb0..06c06139e7 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.cached.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template index 6a81774acd..3d422266f1 100644 --- a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -20,7 +20,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..dec3d4c057 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model.
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
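A quick way to confirm the download described above before unpacking: a minimal sketch using standard tools, where `${corpus}` and `${download_checksum}` are the same template placeholders used in the commands above (i.e., not literal values).

```bash
# Sanity-check the downloaded tarball before unpacking (placeholders as above).
ls -lh collections/${corpus}.tar    # expect roughly 39 GB
md5sum collections/${corpus}.tar    # expect ${download_checksum}
```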
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template new file mode 100644 index 0000000000..922a005a19 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.template new file mode 100644 index 0000000000..8693cec8d0 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
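To make the `-l 2` note above concrete, the evaluation typically looks something like the following sketch; the qrels path and run file name here are illustrative assumptions, so substitute whatever `${eval_cmds}` and `${ranking_cmds}` expand to on the generated page.

```bash
# nDCG keeps qrels of all relevance grades (no -l flag needed).
bin/trec_eval -c -m ndcg_cut.10 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.dl20-passage.bge-flat.txt

# For AP and recall, relevance grade 1 counts as not relevant, hence -l 2.
bin/trec_eval -c -l 2 -m map -m recall.1000 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.dl20-passage.bge-flat.txt
```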
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.template new file mode 100644 index 0000000000..1e823fb318 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..82170e0b8c --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
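Because both HNSW indexing and quantization are non-deterministic, it can be handy to quantify how much two trials actually differ. Below is a minimal sketch (not part of the regression framework); the run file names are hypothetical stand-ins for two runs produced by the retrieval command above.

```bash
# Count queries whose rank-1 passage changed between two trials.
# TREC run format: qid Q0 docid rank score tag (docid is field 3, rank is field 4).
join <(awk '$4 == 1 {print $1, $3}' runs/run.trial1.txt | sort) \
     <(awk '$4 == 1 {print $1, $3}' runs/run.trial2.txt | sort) \
  | awk '$2 != $3' | wc -l
```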
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..fee66c30a9 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.template new file mode 100644 index 0000000000..96c71735f5 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
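One practical note, offered as an assumption rather than something this page documents: since `run_regression.py` exposes `--index`, `--verify`, and `--search` as separate flags, it may be possible to re-run only retrieval and verification against an index built by an earlier invocation by dropping `--index`, e.g.:

```bash
# Assumption: skip re-indexing and only re-run retrieval + verification.
python src/main/python/run_regression.py --verify --search --regression ${test_name}
```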
diff --git a/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..167943b562 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index d65eba96b3..d3e7c80c39 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -53,7 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template index 9b40bcf5a0..20c9ffd112 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -53,7 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..6e3fcf0f2e --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template @@ -0,0 +1,89 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.template new file mode 100644 index 0000000000..69270c2cc6 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.template @@ -0,0 +1,88 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..058cd13518 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template @@ -0,0 +1,90 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). 
+ +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template new file mode 100644 index 0000000000..c2834b467f --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template @@ -0,0 +1,90 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template index 2bfaf45fa9..f0a2c03b58 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template index 0cd21c0e65..4e1ada541e 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. 
-See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template index 95e75285aa..f8d68a2125 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.cached.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template index fc0256d56c..b1bb67a9c4 100644 --- a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.hnsw.onnx.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..d3d7f83435 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.template new file mode 100644 index 0000000000..5862f1c43d --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.template @@ -0,0 +1,96 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
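+
+Before launching the (long-running) indexing job, it is worth a quick sanity check that the placeholder path really points at the unpacked corpus; a hedged sketch (the exact file layout inside the corpus directory may vary):
+
+```bash
+# The unpacked corpus should be a directory of Parquet files totalling roughly the size of the tarball:
+ls collections/${corpus} | head
+du -sh collections/${corpus}
+```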
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.cached.template new file mode 100644 index 0000000000..4f004115c9 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
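+
+As a footnote to the exact-reproducibility note above: two independent trials of this regression should produce byte-identical run files, which is cheap to confirm (a hedged sketch; the run file names are illustrative):
+
+```bash
+# Byte-for-byte comparison of runs from two trials over the same flat index:
+cmp runs/run.trial-1.txt runs/run.trial-2.txt && echo "runs are identical"
+```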
diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.onnx.template new file mode 100644 index 0000000000..1b7469a976 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.flat.onnx.template @@ -0,0 +1,96 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
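+
+For a rough sense of scale, a hedged back-of-the-envelope estimate (assuming 768-dimensional float32 cosDPR-distil vectors; the dimensionality is an assumption, not stated on this page) shows that the raw vector data dominates the index footprint:
+
+```bash
+# 8,841,823 vectors x 768 dims x 4 bytes per float comes to roughly 27 GB of vector data:
+echo "$(( 8841823 * 768 * 4 / 1000000000 )) GB (approx.)"
+```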
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..56812875c2 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..3269597346 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
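+
+For the quantized variant, a similar hedged back-of-the-envelope estimate (again assuming 768-dimensional embeddings) gives the size of the int8 vector data that is consulted at search time; the HNSW graph adds further overhead on top of this:
+
+```bash
+# 8,841,823 vectors x 768 dims x 1 byte per int8 component:
+echo "$(( 8841823 * 768 / 1024 / 1024 / 1024 )) GiB (approx.)"
+```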
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.template new file mode 100644 index 0000000000..602d59f611 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
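+
+One further hedged sanity check on a run file produced above (the file name is illustrative): every topic should come back with at most 1,000 hits, matching the evaluation depth noted above.
+
+```bash
+# Count hits per topic in a TREC-format run (the topic id is the first whitespace-delimited field):
+awk '{print $1}' runs/run.dl20-passage.example.txt | sort | uniq -c | sort -rn | head -3
+```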
diff --git a/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..f75f085b30 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.template @@ -0,0 +1,97 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
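+
+Once indexing finishes, a quick look at the index directory gives a feel for the on-disk footprint of the vectors plus the HNSW graph (a hedged sketch; `/path/to/index` stands in for wherever the generated command writes the index):
+
+```bash
+du -sh /path/to/index     # overall index size on disk
+ls /path/to/index | head  # Lucene segment and vector files
+```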
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template index 7687266f87..8c4a35adff 100644 --- a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw-int8.cached.template @@ -58,9 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template index c2d9305819..10ba767431 100644 --- a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.hnsw.cached.template @@ -58,7 +58,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..d81636da9b --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat-int8.cached.template @@ -0,0 +1,94 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
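+
+As a hedged aside, the `--download`, `--index`, `--verify`, and `--search` flags can be combined selectively; for example, on a machine where the corpus has already been downloaded and the index already built, a run might look like:
+
+```bash
+# Skip downloading and indexing; only verify the existing index and run retrieval plus evaluation:
+python src/main/python/run_regression.py --verify --search --regression ${test_name}
+```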
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat.cached.template new file mode 100644 index 0000000000..662566dec8 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.flat.cached.template @@ -0,0 +1,93 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). 
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..e95d374dc4 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. 
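+For a download this large, it is worth verifying the archive before unpacking; a minimal sketch, assuming `md5sum` from GNU coreutils is available (on macOS, `md5 -q` serves the same purpose):
+
+```bash
+# Print the MD5 hash of the tarball and compare it against the expected checksum above.
+md5sum collections/${corpus}.tar
+```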
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw.cached.template new file mode 100644 index 0000000000..b9a1b9b9da --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage.openai-ada2.parquet.hnsw.cached.template @@ -0,0 +1,95 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). 
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +❗ Retrieval metrics here are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +For computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2102.07662). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.template index 906d9c6098..c800dea1a1 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.cached.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.template index eac1c724a3..809a6374dc 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat-int8.onnx.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.cached.template index c89aecbad7..1ab8daf49c 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.cached.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.template index d65193849b..2ff10818ce 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.flat.onnx.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template index 4758ea5060..45f441d89a 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.cached.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -55,9 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template index 2052635a6b..5ebb774c4c 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw-int8.onnx.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -55,9 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template index 8689ff2e07..88882e2646 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.cached.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -55,7 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template index b6818458b6..458933df51 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.hnsw.onnx.template @@ -17,7 +17,7 @@ From one of our Waterloo servers (e.g., `orca`), the following command will perf python src/main/python/run_regression.py --index --verify --search --regression ${test_name} ``` -We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. From any machine, the following command will download the corpus and perform the complete regression, end to end: @@ -55,7 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..0a2bf5305c --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.template @@ -0,0 +1,82 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With cached queries on quantized indexes, results may differ slightly. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template new file mode 100644 index 0000000000..0ce5e5c895 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.template @@ -0,0 +1,82 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on quantized indexes, results may differ slightly. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.template new file mode 100644 index 0000000000..358ba61628 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.template @@ -0,0 +1,81 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. 
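+Since the topics and qrels live in a git submodule, a fresh clone of Anserini may not have them checked out yet; the standard command below pulls them in (not specific to this regression):
+
+```bash
+# Fetch the anserini-tools submodule containing the topics and qrels.
+git submodule update --init --recursive
+```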
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.template new file mode 100644 index 0000000000..1e31ba1f5e --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.template @@ -0,0 +1,82 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with flat indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. 
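+As a coarse sanity check that the index was actually written (this does not verify the document count), you can inspect the on-disk footprint of the index directory; the path below is a placeholder for whatever `-index` argument was used in the indexing command above:
+
+```bash
+# The flat index stores full float32 vectors for 8.8M passages, so expect a directory in the tens of gigabytes.
+du -sh indexes/<index-directory>/
+```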
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized indexes. +With ONNX query encoding on non-quantized indexes, results may differ slightly. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..4368730ee1 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.template @@ -0,0 +1,87 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. 
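+For a download of this size, an interrupted transfer can usually be resumed rather than restarted, assuming the server supports range requests; a sketch using wget's standard continue flag:
+
+```bash
+# --continue picks up a partial download of the tarball in collections/ instead of starting over.
+wget --continue ${download_url} -P collections/
+```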
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..2ab8035a76 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.template @@ -0,0 +1,87 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.template new file mode 100644 index 0000000000..5a625c8f89 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.template @@ -0,0 +1,87 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. 
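+If you want to double-check that the evaluation set is the expected one, the dev-subset topics file has one query per line; the path below assumes the standard anserini-tools checkout location under `tools/` and may differ in your setup:
+
+```bash
+# Expected output: 6980, matching the number of dev set questions mentioned above.
+wc -l < tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt
+```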
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..76d0077499 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.template @@ -0,0 +1,87 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [BGE-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Shitao Xiao, Zheng Liu, Peitian Zhang, and Niklas Muennighoff. [C-Pack: Packaged Resources To Advance General Chinese Embedding.](https://arxiv.org/abs/2309.07597) _arXiv:2309.07597_, 2023. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded by the BGE-base-en-v1.5 model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 39 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template index 18b91d7fa3..ed92dcf9b4 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw-int8.cached.template @@ -53,9 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template index 0799aa04b5..965646da7d 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.hnsw.cached.template @@ -53,7 +53,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..81a42998b0 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.template @@ -0,0 +1,80 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized flat quantized indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. 
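+Note that the archive and its unpacked contents will coexist in `collections/` until you remove the tarball, so plan for roughly twice the archive size in free disk space; a quick check before downloading, as a sketch:
+
+```bash
+# Confirm there is enough free space on the filesystem backing collections/.
+df -h collections/
+```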
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building quantized flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.template
new file mode 100644
index 0000000000..bc054fd386
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.template
@@ -0,0 +1,79 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking).
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
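+
+The flags passed to `run_regression.py` select which steps run, so once an index has been built you can rerun just the later steps; a hedged sketch, assuming the index from a previous invocation is still in place:
+
+```bash
+# Skip indexing; only re-run verification and the retrieval + evaluation steps against the existing index.
+python src/main/python/run_regression.py --verify --search --regression ${test_name}
+```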
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template
new file mode 100644
index 0000000000..57a3dad08d
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.template
@@ -0,0 +1,85 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with quantized HNSW indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking).
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0.
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template new file mode 100644 index 0000000000..cf557a9810 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.template @@ -0,0 +1,85 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the [Cohere embed-english-v3.0](https://docs.cohere.com/reference/embed) model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with Cohere embed-english-v3.0. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 16 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template index 5a915df835..f59494a8d5 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.cached.template @@ -55,9 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template index ed45e4ec38..af73110fcd 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw-int8.onnx.template @@ -55,9 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template index 635b1f86a7..931a825d73 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.cached.template @@ -55,7 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template index 6f5f60a2da..9e7fdf7232 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.hnsw.onnx.template @@ -55,7 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..d9062c8266 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.template @@ -0,0 +1,82 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. 
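+
+Since the tarball is large, note that an interrupted download can be resumed rather than restarted; a minimal sketch using `wget`'s standard continue flag:
+
+```bash
+# -c resumes a partial download of the same URL into collections/ instead of starting over.
+wget -c ${download_url} -P collections/
+```
+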
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building quantized flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.template
new file mode 100644
index 0000000000..9eb1eef79f
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.template
@@ -0,0 +1,84 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with quantized flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building quantized flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on quantized indexes, results may differ slightly.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.template
new file mode 100644
index 0000000000..70ae6d1f16
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.template
@@ -0,0 +1,81 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.template
new file mode 100644
index 0000000000..26c995351d
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.template
@@ -0,0 +1,84 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with flat indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With ONNX query encoding on non-quantized indexes, results may differ slightly.
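+
+If you want to see where such small differences come from, per-query scores can be compared across runs; a hypothetical sketch (the run and qrels file names are placeholders, and the exact `trec_eval` invocation for this regression is given by the evaluation commands above):
+
+```bash
+# -q emits one line per query per measure, so a diff pinpoints which queries shifted between two runs.
+trec_eval -q -c -m map qrels.txt runs/run.cached.txt > eval.cached.txt
+trec_eval -q -c -m map qrels.txt runs/run.onnx.txt > eval.onnx.txt
+diff eval.cached.txt eval.onnx.txt | head
+```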
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..09f107bf00 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.template @@ -0,0 +1,89 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. 
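+
+If you cloned Anserini without its submodules, the topics and qrels can be pulled in with a standard git command before running the retrieval and evaluation steps below:
+
+```bash
+# Fetches the anserini-tools submodule, which holds the topics-and-qrels files.
+git submodule update --init --recursive
+```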
+ +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template new file mode 100644 index 0000000000..1d8e436bdd --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.template @@ -0,0 +1,89 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with quantized HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.template new file mode 100644 index 0000000000..05c230b689 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.template @@ -0,0 +1,89 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.template b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.template new file mode 100644 index 0000000000..087be53e89 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.template @@ -0,0 +1,89 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom. + +In these experiments, we are performing query inference "on-the-fly" with ONNX. + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 38 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Note that we are performing query inference "on-the-fly" with ONNX in these experiments. + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With ONNX query encoding on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template index eb88ab10cd..60ffe31a17 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw-int8.cached.template @@ -55,9 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. -Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increase the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory. -See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template index 6d7a43a574..ad263299d9 100644 --- a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.hnsw.cached.template @@ -55,7 +55,6 @@ ${index_cmds} The path `/path/to/${corpus}/` should point to the corpus downloaded above. Upon completion, we should have an index with 8,841,823 documents. - ## Retrieval Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.template new file mode 100644 index 0000000000..cbf8f062fe --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.template @@ -0,0 +1,82 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with quantized flat indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized flat indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. 
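+
+As a quick sanity check on the topic set, you can count the dev queries yourself; a hypothetical sketch, assuming the dev-subset topics sit in the submodule under their usual name (adjust the path to match your checkout):
+
+```bash
+# Each line of the (gzipped, tab-separated) topics file is one query; expect 6980.
+gunzip -c tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt.gz | wc -l
+```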
+
+After indexing has completed, you should be able to perform retrieval as follows using quantized flat indexes:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+The above figures are from running brute-force search with cached queries on non-quantized indexes.
+With cached queries on quantized indexes, results may differ slightly.
diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat.cached.template
new file mode 100644
index 0000000000..ea9283a53d
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.flat.cached.template
@@ -0,0 +1,81 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: OpenAI-ada2 embeddings with flat indexes (using cached queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using cached queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+  --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building flat indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
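+
+As a rough sanity check once indexing finishes: 8,841,823 passages at 1536 float32 dimensions work out to roughly 54 GB of raw vectors, so the flat index should be on that order of magnitude on disk. A minimal sketch (the index path below is hypothetical; use whatever `-index` path the indexing command above writes to):
+
+```bash
+# Report the total on-disk footprint of the flat vector index.
+du -sh indexes/msmarco-v1-passage.openai-ada2.parquet.flat/
+```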
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that since we're running brute-force search with cached queries on non-quantized indexes, the results should be reproducible _exactly_. diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template new file mode 100644 index 0000000000..b594b90c6e --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.template @@ -0,0 +1,88 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with quantized HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building quantized HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that both HNSW indexing and quantization are non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. + diff --git a/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.template b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.template new file mode 100644 index 0000000000..89dfb95fd1 --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.template @@ -0,0 +1,88 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: OpenAI-ada2 embeddings with HNSW indexes (using cached queries) + +This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper: + +> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023. + +In these experiments, we are using cached queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 75 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, building HNSW indexes: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +The above figures are from running brute-force search with cached queries on non-quantized **flat** indexes. +With cached queries on non-quantized HNSW indexes, observed results are likely to differ; scores may be lower by up to 0.01, sometimes more. +Note that HNSW indexing is non-deterministic (i.e., results may differ slightly between trials). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
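The four OpenAI-ada2 templates above differ only in the index variant (flat vs. HNSW, with or without int8 quantization); the `${test_name}` placeholder is filled in by the regression pipeline. As a minimal sketch, assuming the regression name matches the filename stem of the corresponding template/YAML (e.g., `msmarco-v1-passage.openai-ada2.parquet.hnsw.cached`), the expanded end-to-end command would look like:

```bash
# Sketch only: the regression name below is assumed to match the filename stem
# of the template/YAML added in this patch; adjust if the pipeline uses a different name.
python src/main/python/run_regression.py --download --index --verify --search \
  --regression msmarco-v1-passage.openai-ada2.parquet.hnsw.cached
```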
+ diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..420d078d80 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: bge-flat-int8-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.004 + R@100: + - 0.007 + R@1000: + - 0.002 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml new file mode 100644 index 0000000000..7d7bfe3405 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: 
false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: bge-flat-int8-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.004 + nDCG@10: + - 0.006 + R@100: + - 0.007 + R@1000: + - 0.005 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.yaml new file mode 100644 index 0000000000..e8c2a9f724 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: bge-flat-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..5985695619 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 
+ metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: bge-flat-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.006 + nDCG@10: + - 0.005 + R@100: + - 0.008 + R@1000: + - 0.005 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..3d2479b91e --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: bge-hnsw-int8-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.02 + R@100: + - 0.03 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..f2612d3e30 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: 
b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: bge-hnsw-int8-onnx + display: BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.01 + nDCG@10: + - 0.02 + R@100: + - 0.025 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..da5a012c5b --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: bge-hnsw-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.008 + nDCG@10: + - 0.009 + R@100: + - 0.009 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml 
b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..282ff91417 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: bge-hnsw-onnx + display: BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4435 + nDCG@10: + - 0.7065 + R@100: + - 0.6171 + R@1000: + - 0.8472 + tolerance: + AP@1000: + - 0.002 + nDCG@10: + - 0.015 + R@100: + - 0.02 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..806564d718 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: 
"[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cohere-embed-english-v3.0-flat-int8-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4884 + nDCG@10: + - 0.6956 + R@100: + - 0.6484 + R@1000: + - 0.8630 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.002 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml new file mode 100644 index 0000000000..ffc5fe3d15 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cohere-embed-english-v3.0-flat-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4884 + nDCG@10: + - 0.6956 + R@100: + - 0.6484 + R@1000: + - 0.8630 \ No newline at end of file diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..7c5d00330d --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: 
AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-int8-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4884 + nDCG@10: + - 0.6956 + R@100: + - 0.6484 + R@1000: + - 0.8630 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.02 + R@100: + - 0.02 + R@1000: + - 0.035 diff --git a/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..925ed65461 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4884 + nDCG@10: + - 0.6956 + R@100: + - 0.6484 + R@1000: + - 0.8630 + tolerance: + AP@1000: + - 0.01 + nDCG@10: + - 0.015 + R@100: + - 0.015 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..9f45de1471 --- /dev/null +++ 
b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-flat-int8-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.003 + R@100: + - 0.003 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml new file mode 100644 index 0000000000..fe80abf105 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-flat-int8-onnx + display: cosDPR-distil + type: flat + params: 
-encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.002 + R@100: + - 0.002 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.cached.yaml new file mode 100644 index 0000000000..dd3d34a741 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-flat-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..82c2c3b78f --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + 
parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-flat-onnx + display: cosDPR-distil + type: flat + params: -encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.001 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..3c725674c3 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw-int8-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.02 + nDCG@10: + - 0.025 + R@100: + - 0.025 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..aaf4e0514b --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: 
AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw-int8-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.02 + nDCG@10: + - 0.025 + R@100: + - 0.025 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..0a7eec9f20 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.025 + R@100: + - 0.02 + R@1000: + - 0.025 diff --git a/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..4fb88509f9 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: 
collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.txt + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-hnsw-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4656 + nDCG@10: + - 0.7250 + R@100: + - 0.6173 + R@1000: + - 0.8201 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.025 + R@100: + - 0.02 + R@1000: + - 0.025 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..0539d9a5cf --- /dev/null +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2-flat-int8-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4788 + nDCG@10: + - 0.7035 + R@100: + - 0.6235 + R@1000: + - 0.8629 + tolerance: + AP@1000: + - 0.002 + nDCG@10: + - 
0.002 + R@100: + - 0.007 + R@1000: + - 0.008 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat.cached.yaml new file mode 100644 index 0000000000..d1f247ccb0 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2-flat-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4788 + nDCG@10: + - 0.7035 + R@100: + - 0.6235 + R@1000: + - 0.8629 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..32e2d8ba27 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: 
topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2-hnsw-int8-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4788 + nDCG@10: + - 0.7035 + R@100: + - 0.6235 + R@1000: + - 0.8629 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.015 + R@100: + - 0.015 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..3734168958 --- /dev/null +++ b/src/main/resources/regression/dl19-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl19 + path: topics.dl19-passage.openai-ada2.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: openai-ada2-hnsw-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4788 + nDCG@10: + - 0.7035 + R@100: + - 0.6235 + R@1000: + - 0.8629 + tolerance: + AP@1000: + - 0.002 + nDCG@10: + - 0.004 + R@100: + - 0.005 + R@1000: + - 0.009 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..a61f22aa45 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + 
parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: bge-flat-int8-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.003 + nDCG@10: + - 0.006 + R@100: + - 0.005 + R@1000: + - 0.002 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml new file mode 100644 index 0000000000..578c8cb567 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: bge-flat-int8-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.004 + nDCG@10: + - 0.003 + R@100: + - 0.004 + R@1000: + - 0.003 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.yaml new file mode 100644 index 0000000000..d9b1c290f8 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: 
indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: bge-flat-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..0c6388fca9 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: bge-flat-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.003 + nDCG@10: + - 0.002 + R@100: + - 0.003 + R@1000: + - 0.005 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..581fb9431d --- /dev/null +++ 
b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: bge-hnsw-int8-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.01 + nDCG@10: + - 0.01 + R@100: + - 0.02 + R@1000: + - 0.03 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..0481321c69 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: bge-hnsw-int8-onnx + display: 
BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.008 + R@100: + - 0.02 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..07d3f015ec --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.bge-base-en-v1.5.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: bge-hnsw-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.003 + nDCG@10: + - 0.001 + R@100: + - 0.009 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..731d5ce634 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m 
recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: bge-hnsw-onnx + display: BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4650 + nDCG@10: + - 0.6780 + R@100: + - 0.7169 + R@1000: + - 0.8503 + tolerance: + AP@1000: + - 0.005 + nDCG@10: + - 0.002 + R@100: + - 0.01 + R@1000: + - 0.01 diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..e0483677d7 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cohere-embed-english-v3.0-flat-int8-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.5067 + nDCG@10: + - 0.7245 + R@100: + - 0.7279 + R@1000: + - 0.8682 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.004 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml new file mode 100644 index 0000000000..a54f120a76 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 
40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cohere-embed-english-v3.0-flat-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.5067 + nDCG@10: + - 0.7245 + R@100: + - 0.7279 + R@1000: + - 0.8682 \ No newline at end of file diff --git a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..f3bbfe5647 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-int8-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.5067 + nDCG@10: + - 0.7245 + R@100: + - 0.7279 + R@1000: + - 0.8682 + tolerance: + AP@1000: + - 0.0057 + nDCG@10: + - 0.007 + R@100: + - 0.02 + R@1000: + - 0.02 diff --git 
a/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..9e5a745f4d --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.5067 + nDCG@10: + - 0.7245 + R@100: + - 0.7279 + R@1000: + - 0.8682 + tolerance: + AP@1000: + - 0.005 + nDCG@10: + - 0.001 + R@100: + - 0.015 + R@1000: + - 0.025 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..b0de920ca4 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + 
separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-flat-int8-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.002 + nDCG@10: + - 0.005 + R@100: + - 0.004 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml new file mode 100644 index 0000000000..e60a433f4c --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-flat-int8-onnx + display: cosDPR-distil + type: flat + params: -encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.005 + R@100: + - 0.004 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.cached.yaml new file mode 100644 index 0000000000..186ab03d9f --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: 
"\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-flat-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..493bc1eea5 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-flat-onnx + display: cosDPR-distil + type: flat + params: -encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.001 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..8a2cfafede --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: 
c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw-int8-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.009 + nDCG@10: + - 0.006 + R@100: + - 0.02 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..2d55327e76 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw-int8-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.009 + nDCG@10: + - 0.006 + R@100: + - 0.02 + R@1000: + - 0.02 diff --git 
a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..20bd4677a7 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.008 + R@100: + - 0.025 + R@1000: + - 0.025 diff --git a/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..c3a86458e0 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.txt + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-hnsw-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4876 + nDCG@10: + - 0.7025 + R@100: + - 0.7204 + R@1000: + - 0.8533 + tolerance: + AP@1000: + - 0.015 + nDCG@10: + - 0.008 + R@100: + - 0.025 + R@1000: + - 0.025 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat-int8.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..dd3de04a90 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2-flat-int8-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4771 + nDCG@10: + - 0.6759 + R@100: + - 0.7237 + R@1000: + - 0.8705 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.002 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat.cached.yaml new file mode 100644 index 0000000000..fd0fa79265 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + 
parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2-flat-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.4771 + nDCG@10: + - 0.6759 + R@100: + - 0.7237 + R@1000: + - 0.8705 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..0e529f7ba5 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2-hnsw-int8-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4771 + nDCG@10: + - 0.6759 + R@100: + - 0.7237 + R@1000: + - 0.8705 + tolerance: + AP@1000: + - 0.008 + nDCG@10: + - 0.015 + R@100: + - 0.015 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..66f9859309 --- /dev/null +++ b/src/main/resources/regression/dl20-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection 
+generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: bin/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.openai-ada2.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: openai-ada2-hnsw-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.4771 + nDCG@10: + - 0.6759 + R@100: + - 0.7237 + R@1000: + - 0.8705 + tolerance: + AP@1000: + - 0.001 + nDCG@10: + - 0.001 + R@100: + - 0.003 + R@1000: + - 0.009 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..6535395862 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-flat-int8-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.002 + RR@10: + - 0.002 + R@100: + - 0.002 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml new file mode 100644 index 
0000000000..acaff45595 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-flat-int8-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.002 + RR@10: + - 0.002 + R@100: + - 0.002 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.yaml new file mode 100644 index 0000000000..6b72f68ed4 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz + 
qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-flat-cached + display: BGE-base-en-v1.5 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..acbbccc3de --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-flat.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-flat-onnx + display: BGE-base-en-v1.5 + type: flat + params: -encoder BgeBaseEn15 -hits 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.001 + RR@10: + - 0.001 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..1c112716d6 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + 
command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-hnsw-int8-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.004 + RR@10: + - 0.004 + R@100: + - 0.01 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..f19d31534b --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-hnsw-int8-onnx + display: BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.004 + RR@10: + - 0.004 + R@100: + - 0.01 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..7a1a6fb79e --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: 
https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.bge-base-en-v1.5.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-hnsw-cached + display: BGE-base-en-v1.5 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 0.002 + RR@10: + - 0.002 + R@100: + - 0.008 + R@1000: + - 0.01 diff --git a/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..7c18399582 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.bge-base-en-v1.5.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-bge-base-en-v1.5.parquet +corpus_path: collections/msmarco/msmarco-passage-bge-base-en-v1.5.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-bge-base-en-v1.5.parquet.tar +download_checksum: b235e19ec492c18a18057b30b8b23fd4 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.bge-base-en-v1.5/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: bge-hnsw-onnx + display: BGE-base-en-v1.5 + type: hnsw + params: -encoder BgeBaseEn15 -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3641 + RR@10: + - 0.3583 + R@100: + - 0.9006 + R@1000: + - 0.9811 + tolerance: + AP@1000: + - 
0.002 + RR@10: + - 0.002 + R@100: + - 0.008 + R@1000: + - 0.01 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..6eeeaf4645 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cohere-embed-english-v3.0-flat-int8-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3716 + RR@10: + - 0.3658 + R@100: + - 0.8935 + R@1000: + - 0.9786 + tolerance: + AP@1000: + - 0.003 + RR@10: + - 0.003 + R@100: + - 0.003 + R@1000: + - 0.009 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml new file mode 100644 index 0000000000..194f553d6f --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: 
bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cohere-embed-english-v3.0-flat-cached + display: cohere-embed-english-v3.0 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3716 + RR@10: + - 0.3658 + R@100: + - 0.8935 + R@1000: + - 0.9786 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..a40eb51e2a --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-int8-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3716 + RR@10: + - 0.3658 + R@100: + - 0.8935 + R@1000: + - 0.9786 + tolerance: + AP@1000: + - 0.004 + RR@10: + - 0.004 + R@100: + - 0.01 + R@1000: + - 0.01 diff --git a/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..95a5c5d939 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cohere-embed-english-v3.0.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cohere-embed-english-v3.0.parquet +corpus_path: collections/msmarco/msmarco-passage-cohere-embed-english-v3.0.parquet/ + +download_url: 
https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cohere-embed-english-v3.0.parquet.tar +download_checksum: 40c5caf33476746e93ceeb75174b8d64 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cohere-embed-english-v3.0/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cohere-embed-english-v3.0.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cohere-embed-english-v3.0-hnsw-cached + display: cohere-embed-english-v3.0 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3716 + RR@10: + - 0.3658 + R@100: + - 0.8935 + R@1000: + - 0.9786 + tolerance: + AP@1000: + - 0.004 + RR@10: + - 0.004 + R@100: + - 0.015 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..86987a803e --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-flat-int8-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 
0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.001 + RR@10: + - 0.001 + R@100: + - 0.002 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml new file mode 100644 index 0000000000..9ffb6997f8 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-flat-int8-onnx + display: cosDPR-distil + type: flat + params: -encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.001 + RR@10: + - 0.001 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.yaml new file mode 100644 index 0000000000..553e5dbd9d --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: 
R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-flat-cached + display: cosDPR-distil + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.yaml new file mode 100644 index 0000000000..631883df3f --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.flat.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-flat.msmarco-v1-passage.cos-dpr-distil/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-flat-onnx + display: cosDPR-distil + type: flat + params: -encoder CosDprDistil -hits 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.001 + RR@10: + - 0.001 + R@100: + - 0.001 + R@1000: + - 0.001 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..7d1e8aa3dd --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 
-quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw-int8-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.003 + RR@10: + - 0.003 + R@100: + - 0.01 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml new file mode 100644 index 0000000000..7f0ade3d05 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw-int8.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw-int8-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.003 + RR@10: + - 0.003 + R@100: + - 0.01 + R@1000: + - 0.015 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..ab5ec4e57e --- /dev/null +++ 
b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-hnsw-cached + display: cosDPR-distil + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.004 + RR@10: + - 0.004 + R@100: + - 0.015 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml new file mode 100644 index 0000000000..e7e91a8d6e --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.cos-dpr-distil.parquet.hnsw.onnx.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-cos-dpr-distil.parquet +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.parquet.tar +download_checksum: c8a204fbc3ccda581aa375936af43a97 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.cos-dpr-distil/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: TsvInt +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.txt + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: 
cos-dpr-distil-hnsw-onnx + display: cosDPR-distil + type: hnsw + params: -encoder CosDprDistil -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3942 + RR@10: + - 0.3896 + R@100: + - 0.9075 + R@1000: + - 0.9796 + tolerance: + AP@1000: + - 0.005 + RR@10: + - 0.004 + R@100: + - 0.015 + R@1000: + - 0.02 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.yaml new file mode 100644 index 0000000000..38cb8f8cca --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat-int8.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2-flat-int8-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3505 + RR@10: + - 0.3434 + R@100: + - 0.8996 + R@1000: + - 0.9858 + tolerance: + AP@1000: + - 0.008 + RR@10: + - 0.009 + R@100: + - 0.006 + R@1000: + - 0.002 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat.cached.yaml new file mode 100644 index 0000000000..e2872aee85 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.flat.cached.yaml @@ -0,0 +1,65 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-flat.msmarco-v1-passage.openai-ada2/ +index_type: flat +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: "" + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + 
command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2-flat-cached + display: OpenAI-ada2 + type: flat + params: -hits 1000 -threads 16 + results: + AP@1000: + - 0.3505 + RR@10: + - 0.3434 + R@100: + - 0.8996 + R@1000: + - 0.9858 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml new file mode 100644 index 0000000000..fee6401b8a --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw-int8.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw-int8.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 -quantize.int8 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2-hnsw-int8-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3505 + RR@10: + - 0.3434 + R@100: + - 0.8996 + R@1000: + - 0.9858 + tolerance: + AP@1000: + - 0.015 + RR@10: + - 0.015 + R@100: + - 0.01 + R@1000: + - 0.006 diff --git a/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml new file mode 100644 index 0000000000..65967d7b38 --- /dev/null +++ b/src/main/resources/regression/msmarco-v1-passage.openai-ada2.parquet.hnsw.cached.yaml @@ -0,0 +1,74 @@ +--- +corpus: msmarco-passage-openai-ada2.parquet +corpus_path: collections/msmarco/msmarco-passage-openai-ada2.parquet/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.parquet.tar +download_checksum: fa3637e9c4150b157270e19ef3a4f779 + +index_path: indexes/lucene-hnsw.msmarco-v1-passage.openai-ada2/ +index_type: hnsw +collection_class: 
ParquetDenseVectorCollection +generator_class: ParquetDenseVectorDocumentGenerator +index_threads: 16 +index_options: -M 16 -efC 100 + +metrics: + - metric: AP@1000 + command: bin/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: bin/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: bin/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: bin/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: openai-ada2-hnsw-cached + display: OpenAI-ada2 + type: hnsw + params: -hits 1000 -efSearch 1000 -threads 16 + results: + AP@1000: + - 0.3505 + RR@10: + - 0.3434 + R@100: + - 0.8996 + R@1000: + - 0.9858 + tolerance: + AP@1000: + - 0.002 + RR@10: + - 0.002 + R@100: + - 0.005 + R@1000: + - 0.005
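
These regression configurations all follow the same schema: the harness evidently builds the index described by `index_path`/`index_options` from the Parquet corpus, retrieves with the per-model `params` (the `.onnx` variants add an `-encoder` flag for query-time inference, the `.cached` variants read pre-encoded queries via `JsonIntVector` topics), and then checks each entry under `metrics` against the expected `results`, within `tolerance` where one is given. As a rough illustration of how the `metrics` block maps onto `trec_eval` invocations, here is a minimal Python sketch; it is not the actual Anserini regression driver, and the run-file argument, the qrels directory default, and the zero-tolerance fallback for configs without a `tolerance` block are assumptions made for the example.

```python
# Illustrative sketch only -- not the Anserini regression harness.
# Shows how one of these YAML configs could drive trec_eval verification.
import subprocess
import yaml  # pip install pyyaml


def verify(config_path: str, run_file: str,
           qrels_dir: str = "tools/topics-and-qrels") -> None:
    """Check a run file against the expected scores in a regression YAML."""
    with open(config_path) as f:
        config = yaml.safe_load(f)

    # Each config in this diff has a single dev topic set and a single model.
    qrels = f"{qrels_dir}/{config['topics'][0]['qrel']}"
    model = config['models'][0]

    for metric in config['metrics']:
        # e.g., "bin/trec_eval -c -m map <qrels> <run>"
        cmd = f"{metric['command']} {metric['params']} {qrels} {run_file}"
        output = subprocess.run(cmd.split(), capture_output=True,
                                text=True, check=True).stdout

        # trec_eval prints "<metric>\t<qid>\t<score>"; the config's
        # separator and parse_index pick out the score column.
        score = float(output.strip().split(metric['separator'])[metric['parse_index']])

        expected = model['results'][metric['metric']][0]
        # Configs without a tolerance block (the flat.cached ones above) are
        # treated here as exact-match; the real harness may differ.
        tol = model.get('tolerance', {}).get(metric['metric'], [0.0])[0]
        status = "OK" if abs(score - expected) <= tol else "FAIL"
        print(f"{metric['metric']}: {score:.{metric['metric_precision']}f} "
              f"(expected {expected} +/- {tol}) {status}")
```

In practice these files are exercised end to end through the repository's regression driver rather than by hand, typically along the lines of `python src/main/python/run_regression.py --index --verify --search --regression msmarco-v1-passage.bge-base-en-v1.5.parquet.flat.onnx`, which performs indexing, retrieval, and the score verification sketched above in one pass.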