diff --git a/docs/experiments-arctic.md b/docs/experiments-arctic.md
new file mode 100644
index 000000000..1db71e011
--- /dev/null
+++ b/docs/experiments-arctic.md
@@ -0,0 +1,50 @@
+# Pyserini: Reproducing Arctic Results
+
+## MS MARCO v2.1 Doc
+
+To handle the large size of the MS MARCO v2.1 dataset, the index is split into two partitions, so we perform a retrieval run against each partition:
+
+```bash
+python -m pyserini.search.faiss --index /store/scratch/sjupadhy/indexes/msmarco-v2.1-snowflake-arctic-embed-l-1 \
+--topics /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/topics.msmarco-v2-doc.dev.json \
+--encoded-queries /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l/ \
+--output run.msmarco-v2.1-doc.arctic-embed-l-1.dev.txt \
+--hits 2000 --threads 16 --batch-size 128
+
+python -m pyserini.search.faiss --index /store/scratch/sjupadhy/indexes/msmarco-v2.1-snowflake-arctic-embed-l-2 \
+--topics /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/topics.msmarco-v2-doc.dev.json \
+--encoded-queries /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l/ \
+--output run.msmarco-v2.1-doc.arctic-embed-l-2.dev.txt \
+--hits 2000 --threads 16 --batch-size 128
+```
+
+### Merging and compiling doc-wise results
+
+Since the available embeddings correspond to document segments, we need to compile document-level results. Place the two run files from the previous step in a folder (here, `arctic_runs`) and merge them with:
+
+```bash
+python scripts/arctic/merge_retrieved_results.py --arctic_run_folder arctic_runs \
+--output_file run.msmarco-v2.1-doc.arctic-embed-l-merged.dev.txt \
+--k 1000
+```
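+
+The merge script collapses segment-level hits into document-level results: segment ids of the form `<doc_id>#<segment>` are mapped back to their document id, and each document keeps the maximum score over its segments (a MaxP-style aggregation). The helper below is only an illustrative sketch of that idea, not part of Pyserini:
+
+```python
+# Illustrative sketch: collapse segment-level hits to document level by max score.
+def max_aggregate(hits):
+    """hits: iterable of (segment_id, score) pairs; returns {doc_id: best score}."""
+    best = {}
+    for segment_id, score in hits:
+        doc_id = segment_id.split("#")[0]
+        if doc_id not in best or score > best[doc_id]:
+            best[doc_id] = score
+    return best
+```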
"documents": 5183, "downloaded": False, "texts": "beir-v1.0.0-scifact.flat" - } + }, + "beir-v1.0.0-nq.arctic-embed-m-v1.5": { + "description": "Faiss flat index for BEIR (v1.0.0): NQ, encoded by Arctic embed-m-v1.5.", + "filename": "faiss-flat.beir-v1.0.0-nq.arctic-embed-m-v1.5.20240908.tar.gz", + "readme": "faiss-flat.beir-v1.0.0.arctic-embed-m-v1.5.20240908.README.md", + "urls": [ + "https://www.dropbox.com/scl/fi/abu8kn936rrxd85dk6sqy/faiss-flat.beir-v1.0.0-nq.arctic-embed-m-v1.5.20240908.tar.gz?dl=1&rlkey=9s7hy66d8en0nbv3ih6wjxh95&st=ooinw62c" + ], + "md5": "9a0dfded63e4554a002866ea57f4a30e", + "size compressed (bytes)": 7617697773, + "documents": 2681468, + "downloaded": False, + "texts": "beir-v1.0.0-nq.flat" + }, } FAISS_INDEX_INFO_MRTYDI = { diff --git a/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md b/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md new file mode 100644 index 000000000..5eb4add3d --- /dev/null +++ b/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md @@ -0,0 +1,26 @@ +# msmarco-v2.1-arctic-embed-l + +Faiss FlatIP indexes of msmarco v2.1 encoded by Snowflake embed-l. These indexes were generated on 2024/08/26 on `orca`. + +The indexes were generated from indexing embeddings available on [Huggingface](https://huggingface.co/datasets/Snowflake/msmarco-v2.1-snowflake-arctic-embed-l). + +## Preparation +Due to msmarco v2.1 dataset's large size, indexes needed to be divided in two parts. + +```bash +python scripts/arctic/convert_embeddings.py --embeddings_folder /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/corpus \ +--output /store/scratch/sjupadhy/indexes/msmarco-v2.1-dev-snowflake-arctic-embed-l-1 \ +--indices 0_30 + +python scripts/arctic/convert_embeddings.py --embeddings_folder /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/corpus \ +--output /store/scratch/sjupadhy/indexes/msmarco-v2.1-dev-snowflake-arctic-embed-l-2 \ +--indices 30_59 +``` + +### Topic embeddings +```bash +python scripts/arctic/convert_queries.py --embedding_path /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/snowflake-arctic-embed-l-topics.msmarco-v2-doc.dev.parquet \ +--output /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l + +``` + diff --git a/pyserini/util.py b/pyserini/util.py index 1649b3585..80ec5ba9f 100644 --- a/pyserini/util.py +++ b/pyserini/util.py @@ -70,6 +70,7 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb if not local_filename: filename = url.split('/')[-1] filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter + filename = re.sub(r'\?.*$', '', filename) else: # Otherwise, use the specified local_filename: filename = local_filename diff --git a/scripts/arctic/convert_embeddings.py b/scripts/arctic/convert_embeddings.py new file mode 100644 index 000000000..fca3e3e41 --- /dev/null +++ b/scripts/arctic/convert_embeddings.py @@ -0,0 +1,74 @@ +"""Converts the hgf embeddings for documents into pyserini compatible format.""" + +import argparse +import os +import pandas as pd +import faiss +import numpy as np +from tqdm import tqdm + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--embeddings_folder", + type=str, + required=True, + help="Path to corpus embeddings folder downloaded from hgf.", + ) + parser.add_argument( + "--output", type=str, help="Path 
diff --git a/pyserini/util.py b/pyserini/util.py
index 1649b3585..80ec5ba9f 100644
--- a/pyserini/util.py
+++ b/pyserini/util.py
@@ -70,6 +70,7 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
     if not local_filename:
         filename = url.split('/')[-1]
         filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter
+        filename = re.sub(r'\?.*$', '', filename)  # Strip any remaining URL query parameters (e.g., Dropbox's 'rlkey' and 'st')
     else:
         # Otherwise, use the specified local_filename:
         filename = local_filename
diff --git a/scripts/arctic/convert_embeddings.py b/scripts/arctic/convert_embeddings.py
new file mode 100644
index 000000000..fca3e3e41
--- /dev/null
+++ b/scripts/arctic/convert_embeddings.py
@@ -0,0 +1,74 @@
+"""Converts the Hugging Face document embeddings into a Pyserini-compatible Faiss index."""
+
+import argparse
+import os
+
+import faiss
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--embeddings_folder",
+        type=str,
+        required=True,
+        help="Path to the corpus embeddings folder downloaded from Hugging Face.",
+    )
+    parser.add_argument(
+        "--output", type=str, help="Path to store the Faiss IndexFlatIP.", required=True
+    )
+    parser.add_argument(
+        "--indices",
+        type=str,
+        help="Start and end file index separated by '_' (e.g., 0_30), or 'full'.",
+        required=False,
+        default="full",
+    )
+    parser.add_argument(
+        "--start_filter",
+        type=str,
+        help="Only include files whose names start with this prefix.",
+        required=False,
+        default="",
+    )
+
+    args = parser.parse_args()
+
+    os.makedirs(args.output, exist_ok=True)
+
+    folder_path = args.embeddings_folder
+
+    # Sort for a deterministic order, so --indices partitions are reproducible.
+    files = sorted(file for file in os.listdir(folder_path) if file.endswith(".parquet") and file.startswith(args.start_filter))
+
+    if args.indices == "full":
+        start = 0
+        end = len(files)
+    else:
+        indices = args.indices.split("_")
+        start = int(indices[0])
+        end = int(indices[1])
+
+    all_embeddings = []
+    doc_ids = []
+    for file_name in tqdm(files[start:end]):
+        file_path = os.path.join(folder_path, file_name)
+        df = pd.read_parquet(file_path)
+        embeddings = df["VECTOR_MAIN"].tolist()
+        # Faiss expects a contiguous float32 matrix.
+        embeddings = np.array(embeddings, dtype=np.float32)
+        dim = embeddings[0].shape[0]
+        faiss.normalize_L2(embeddings)
+        all_embeddings.append(embeddings.reshape(-1, dim))
+        doc_ids.extend(df["DOC_ID"].tolist())
+
+    combined_embeddings = np.vstack(all_embeddings)
+
+    index = faiss.IndexFlatIP(combined_embeddings.shape[1])
+    index.add(combined_embeddings)
+    faiss.write_index(index, os.path.join(args.output, "index"))
+
+    file_path = os.path.join(args.output, "docid")
+
+    with open(file_path, "w") as file:
+        for value in doc_ids:
+            file.write(f"{value}\n")
diff --git a/scripts/arctic/convert_topics.py b/scripts/arctic/convert_topics.py
new file mode 100644
index 000000000..50bc60774
--- /dev/null
+++ b/scripts/arctic/convert_topics.py
@@ -0,0 +1,47 @@
+"""Converts the Hugging Face topic (query) embeddings into a Pyserini-compatible format.
+
+python scripts/arctic/convert_topics.py --embedding_path /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/snowflake-arctic-embed-l-topics.msmarco-v2-doc.dev.parquet --output /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l
+"""
+
+import argparse
+import os
+
+import faiss
+import numpy as np
+import pandas as pd
+
+from pyserini.search import get_topics
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--embedding_path",
+        type=str,
+        required=True,
+        help="Path to the topic embeddings file downloaded from Hugging Face.",
+    )
+    parser.add_argument(
+        "--output", type=str, help="Path to store embedding.pkl.", required=True
+    )
+    parser.add_argument(
+        "--topic", type=str, help="Pyserini topic name (used to look up query text when it is missing).", required=True
+    )
+
+    args = parser.parse_args()
+
+    os.makedirs(args.output, exist_ok=True)
+
+    df = pd.read_parquet(args.embedding_path)
+    if "embedding" not in df.columns:
+        df.rename(columns={"VECTOR_MAIN": "embedding", "QUERY_ID": "id"}, inplace=True)
+    # Faiss expects a contiguous float32 matrix for normalization.
+    array_2d = np.vstack(df["embedding"].values).astype(np.float32)
+    faiss.normalize_L2(array_2d)
+    df["embedding"] = [array_2d[i, :] for i in range(array_2d.shape[0])]
+
+    if "text" not in df.columns:
+        topics_mapping = get_topics(args.topic)
+        text_list = [topics_mapping.get(topic).get("title") for topic in df["id"].to_list()]
+        df["text"] = text_list
+
+    df.to_pickle(os.path.join(args.output, "embedding.pkl"))
diff --git a/scripts/arctic/merge_retrieved_results.py b/scripts/arctic/merge_retrieved_results.py
new file mode 100644
index 000000000..9cde37a10
--- /dev/null
+++ b/scripts/arctic/merge_retrieved_results.py
@@ -0,0 +1,66 @@
+import argparse
+import os
+from typing import List
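+
+
+# Each run file produced by pyserini.search.faiss is in TREC format:
+# "qid Q0 segment_id rank score tag", where segment ids look like "<doc_id>#<segment>".
+# For every query we keep, per document, the maximum score over its segments
+# (MaxP-style aggregation) and write the top-k documents as a merged TREC run.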
+def merge_retrieved(shard_files: List[str], output_file: str, top_n: int) -> None:
+    merged_results = {}
+    for shard_file in shard_files:
+        print(f"Loading shard {shard_file}")
+        with open(shard_file, "r") as f:
+            for line in f:
+                data = line.split()
+                if data[0] not in merged_results:
+                    merged_results[data[0]] = []
+                merged_results[data[0]].append((data[2], data[4]))
+    print("Shards all loaded, merging results and sorting by score")
+    run = {}
+
+    for query_id, doc_scores in merged_results.items():
+        doc_score_dict = {}
+        for passage_id, score in doc_scores:
+            doc_id = passage_id.split("#")[0]
+            if doc_id not in doc_score_dict:
+                # Scores are similarities in the range -1 to 1, so -1 is a safe floor.
+                doc_score_dict[doc_id] = -1
+            if float(score) > float(doc_score_dict[doc_id]):
+                doc_score_dict[doc_id] = score
+        # Sort numerically; the scores are still strings at this point.
+        top_docs = sorted(doc_score_dict.items(), key=lambda x: float(x[1]), reverse=True)[
+            :top_n
+        ]
+        run[query_id] = {
+            doc_id: round(float(score) * 100, 2) for doc_id, score in top_docs
+        }
+    results = []
+    for qid in run:
+        for index, doc_id in enumerate(run[qid]):
+            results.append(
+                f"{qid} Q0 {doc_id} {index + 1} {run[qid][doc_id]} faiss-merged"
+            )
+
+    with open(output_file, "w") as f:
+        for line in results:
+            f.write(f"{line}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--arctic_run_folder",
+        type=str,
+        required=True,
+        help="Path to the folder of per-partition run files to be combined.",
+    )
+    parser.add_argument(
+        "--output_file", type=str, help="Path to the merged, document-level run file.", required=True
+    )
+    parser.add_argument("--k", default=1000, help="Number of documents to keep per query.", type=int)
+    args = parser.parse_args()
+
+    files = [
+        os.path.join(args.arctic_run_folder, file)
+        for file in os.listdir(args.arctic_run_folder)
+    ]
+
+    merge_retrieved(files, args.output_file, args.k)