From 558a22799f758f515c8f2acc49b3edc26e683b80 Mon Sep 17 00:00:00 2001
From: Shivani Upadhyay
Date: Mon, 26 Aug 2024 03:23:09 -0400
Subject: [PATCH 1/2] Adds scripts and initial docs for arctic embeddings

---
 docs/experiments-arctic.md                    | 36 ++++
 ...v2.1-doc.arctic-embed-l.20240824.README.md | 26 ++++
 scripts/arctic/convert_embeddings.py          | 66 +++++++++++++++++++
 scripts/arctic/convert_topics.py              | 33 ++++++++++
 scripts/arctic/merge_retrieved_results.py     | 66 +++++++++++++++++++
 5 files changed, 227 insertions(+)
 create mode 100644 docs/experiments-arctic.md
 create mode 100644 pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md
 create mode 100644 scripts/arctic/convert_embeddings.py
 create mode 100644 scripts/arctic/convert_topics.py
 create mode 100644 scripts/arctic/merge_retrieved_results.py

diff --git a/docs/experiments-arctic.md b/docs/experiments-arctic.md
new file mode 100644
index 000000000..d51ef4b3a
--- /dev/null
+++ b/docs/experiments-arctic.md
@@ -0,0 +1,36 @@
+# Pyserini: Reproducing Arctic Results
+
+## MS MARCO v2.1 doc
+To handle the MS MARCO v2.1 dataset's large size, the index is divided into two partitions, so we perform a retrieval run against each partition.
+
+```bash
+python -m pyserini.search.faiss --index /store/scratch/sjupadhy/indexes/msmarco-v2.1-snowflake-arctic-embed-l-1 \
+--topics /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/topics.msmarco-v2-doc.dev.json \
+--encoded-queries /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l/ \
+--output run.msmarco-v2.1-doc.arctic-embed-l-1.dev.txt \
+--hits 2000 --threads 16 --batch-size 128
+
+
+python -m pyserini.search.faiss --index /store/scratch/sjupadhy/indexes/msmarco-v2.1-snowflake-arctic-embed-l-2 \
+--topics /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/topics.msmarco-v2-doc.dev.json \
+--encoded-queries /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l/ \
+--output run.msmarco-v2.1-doc.arctic-embed-l-2.dev.txt \
+--hits 2000 --threads 16 --batch-size 128
+```
+
+## Merging and compiling docwise results
+As the available embeddings are per document segment, the retrieved results need to be compiled into docwise results. Place both run files in a folder (e.g., `arctic_runs`) and merge them with:
+```bash
+python scripts/arctic/merge_retrieved_results.py --arctic_run_folder arctic_runs \
+--output_file run.msmarco-v2.1-doc.arctic-embed-l-merged.dev.txt \
+--k 1000
+```
+
+## Evaluation
+```bash
+python -m pyserini.eval.trec_eval -c -m recall.1000 -m recall.100 -m ndcg_cut.10 /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/qrels/qrels_qrels.msmarco-v2.1-doc.dev.txt run.msmarco-v2.1-doc.arctic-embed-l-merged.dev.txt
+Results:
+recall_1000 all 0.9408
+recall_100 all 0.8513
+ndcg_cut_10 all 0.3583
+```
\ No newline at end of file
diff --git a/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md b/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md
new file mode 100644
index 000000000..5eb4add3d
--- /dev/null
+++ b/pyserini/resources/index-metadata/faiss-flat.msmarco-v2.1-doc.arctic-embed-l.20240824.README.md
@@ -0,0 +1,26 @@
+# msmarco-v2.1-arctic-embed-l
+
+Faiss FlatIP indexes of MS MARCO v2.1 encoded by Snowflake's arctic-embed-l model. These indexes were generated on 2024/08/26 on `orca`.
+
+The indexes were generated by indexing the embeddings available on [Huggingface](https://huggingface.co/datasets/Snowflake/msmarco-v2.1-snowflake-arctic-embed-l).
+
+## Preparation
+Due to the MS MARCO v2.1 dataset's large size, the indexes needed to be divided into two parts:
+
+```bash
+python scripts/arctic/convert_embeddings.py --embeddings_folder /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/corpus \
+--output /store/scratch/sjupadhy/indexes/msmarco-v2.1-dev-snowflake-arctic-embed-l-1 \
+--indices 0_30
+
+python scripts/arctic/convert_embeddings.py --embeddings_folder /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/corpus \
+--output /store/scratch/sjupadhy/indexes/msmarco-v2.1-dev-snowflake-arctic-embed-l-2 \
+--indices 30_59
+```
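+
+To sanity-check a shard (a minimal sketch; the path is the output of the first command above), confirm that the Faiss index and its docid file line up one-to-one:
+```python
+import os
+
+import faiss
+
+index_dir = "/store/scratch/sjupadhy/indexes/msmarco-v2.1-dev-snowflake-arctic-embed-l-1"
+index = faiss.read_index(os.path.join(index_dir, "index"))
+with open(os.path.join(index_dir, "docid")) as f:
+    num_ids = sum(1 for _ in f)
+assert index.ntotal == num_ids  # one docid per stored vector
+print(index.ntotal, index.d)    # vector count and embedding dimensionality
+```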
+
+### Topic embeddings
+```bash
+python scripts/arctic/convert_topics.py --embedding_path /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/snowflake-arctic-embed-l-topics.msmarco-v2-doc.dev.parquet \
+--output /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l
+
+```
+
diff --git a/scripts/arctic/convert_embeddings.py b/scripts/arctic/convert_embeddings.py
new file mode 100644
index 000000000..5612fb6f8
--- /dev/null
+++ b/scripts/arctic/convert_embeddings.py
@@ -0,0 +1,66 @@
+"""Converts the hgf embeddings for documents into a Pyserini-compatible format.
+"""
+
+import argparse
+import os
+import pandas as pd
+import faiss
+import numpy as np
+from tqdm import tqdm
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--embeddings_folder",
+        type=str,
+        required=True,
+        help="Path to corpus embeddings folder downloaded from hgf.",
+    )
+    parser.add_argument(
+        "--output", type=str, help="Path to folder in which to store the Faiss IndexFlatIP index and docid file.", required=True
+    )
+    parser.add_argument(
+        "--indices",
+        type=str,
+        help="Start and end file index separated by '_', or 'full'.",
+        required=False,
+        default="full",
+    )
+
+    args = parser.parse_args()
+
+    folder_path = args.embeddings_folder
+
+    files = sorted(file for file in os.listdir(folder_path) if file.endswith(".parquet"))
+
+    if args.indices == "full":
+        start = 0
+        end = len(files)
+    else:
+        indices = args.indices.split("_")
+        start = int(indices[0])
+        end = int(indices[1])
+
+    all_embeddings = []
+    doc_ids = []
+    for file_name in tqdm(files[start:end]):
+        file_path = os.path.join(folder_path, file_name)
+        df = pd.read_parquet(file_path)
+        embeddings = df["embedding"].tolist()
+        embeddings = np.array(embeddings, dtype=np.float32)  # Faiss requires float32
+        dim = embeddings[0].shape[0]
+        faiss.normalize_L2(embeddings)
+        all_embeddings.append(embeddings.reshape(-1, dim))
+        doc_ids.extend(df["doc_id"].tolist())
+
+    combined_embeddings = np.vstack(all_embeddings)
+
+    index = faiss.IndexFlatIP(combined_embeddings.shape[1])
+    index.add(combined_embeddings)
+    faiss.write_index(index, os.path.join(args.output, "index"))
+
+    file_path = os.path.join(args.output, "docid")
+
+    with open(file_path, "w") as file:
+        for value in doc_ids:
+            file.write(f"{value}\n")
diff --git a/scripts/arctic/convert_topics.py b/scripts/arctic/convert_topics.py
new file mode 100644
index 000000000..1788d19ef
--- /dev/null
+++ b/scripts/arctic/convert_topics.py
@@ -0,0 +1,33 @@
+"""Converts the hgf embeddings for topics into a Pyserini-compatible format.
+
+python scripts/arctic/convert_topics.py --embedding_path /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/topics/snowflake-arctic-embed-l-topics.msmarco-v2-doc.dev.parquet --output /store/scratch/sjupadhy/queries/msmarco-v2.1-dev-snowflake-arctic-embed-l
+"""
+
+import argparse
+import os
+
+import faiss
+import numpy as np
+import pandas as pd
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--embedding_path",
+        type=str,
+        required=True,
+        help="Path to the topic embeddings file downloaded from hgf.",
+    )
+    parser.add_argument(
+        "--output", type=str, help="Path to store embedding.pkl.", required=True
+    )
+
+    args = parser.parse_args()
+
+    df = pd.read_parquet(args.embedding_path)
+    array_2d = np.vstack(df["embedding"].values).astype(np.float32)
+    faiss.normalize_L2(array_2d)
+    df["embedding"] = [array_2d[i, :] for i in range(array_2d.shape[0])]
+
+    df.to_pickle(os.path.join(args.output, "embedding.pkl"))
diff --git a/scripts/arctic/merge_retrieved_results.py b/scripts/arctic/merge_retrieved_results.py
new file mode 100644
index 000000000..9cde37a10
--- /dev/null
+++ b/scripts/arctic/merge_retrieved_results.py
@@ -0,0 +1,66 @@
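+"""Merges sharded segment-level run files into a single document-level run.
+
+Each input line is in TREC run format, `qid Q0 <docid>#<segment> rank score tag`.
+For every query, a document's score is the maximum over its segments, and the
+top-k documents are rewritten as `qid Q0 <docid> rank score faiss-merged`.
+"""
+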
+import argparse
+import os
+from typing import List
+
+
+def merge_retrieved(shard_files: List[str], output_file: str, top_n: int) -> None:
+    merged_results = {}
+    for shard_file in shard_files:
+        print(f"Loading shard {shard_file}")
+        with open(shard_file, "r") as f:
+            for line in f:
+                # TREC run format: qid Q0 docid#segment rank score tag
+                data = line.split()
+                if data[0] not in merged_results:
+                    merged_results[data[0]] = []
+                merged_results[data[0]].append((data[2], data[4]))
+    print("Shards all loaded, merging results and sorting by score")
+    run = {}
+
+    for query_id, doc_scores in merged_results.items():
+        doc_score_dict = {}
+        for passage_id, score in doc_scores:
+            doc_id = passage_id.split("#")[0]
+            if doc_id not in doc_score_dict:
+                doc_score_dict[doc_id] = -1  # similarity scores lie in [-1, 1], so -1 is a floor
+            if float(score) > float(doc_score_dict[doc_id]):
+                doc_score_dict[doc_id] = score
+        # Scores are still strings here, so sort numerically rather than lexicographically.
+        top_docs = sorted(doc_score_dict.items(), key=lambda x: float(x[1]), reverse=True)[
+            :top_n
+        ]
+        run[query_id] = {
+            doc_id: round(float(score) * 100, 2) for doc_id, score in top_docs
+        }
+    results = []
+    for qid in run:
+        for index, doc_id in enumerate(run[qid]):
+            results.append(
+                f"{qid} Q0 {doc_id} {index + 1} {run[qid][doc_id]} faiss-merged"
+            )
+
+    with open(output_file, "w") as f:
+        for line in results:
+            f.write(f"{line}\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--arctic_run_folder",
+        type=str,
+        required=True,
+        help="Path to the folder of run files that need to be combined.",
+    )
+    parser.add_argument(
+        "--output_file", type=str, help="Path to the docwise merged output file.", required=True
+    )
+    parser.add_argument("--k", default=1000, help="Number of documents to keep per query.", type=int)
+    args = parser.parse_args()
+
+    files = [
+        os.path.join(args.arctic_run_folder, file)
+        for file in os.listdir(args.arctic_run_folder)
+    ]
+
+    merge_retrieved(files, args.output_file, args.k)

From 51d39b14454bd9a18539761770f8c8d182e981f0 Mon Sep 17 00:00:00 2001
From: Shivani Upadhyay
Date: Sun, 8 Sep 2024 23:00:24 -0400
Subject: [PATCH 2/2] Adds BEIR NQ arctic embeddings

---
 docs/experiments-arctic.md           | 18 ++++++++++++++--
 pyserini/encoded_query_info.py       | 10 ++++++++++
 pyserini/prebuilt_index_info.py      | 15 ++++++++++++++-
 pyserini/util.py                     |  1 +
 scripts/arctic/convert_embeddings.py | 18 +++++++++++++-----
 scripts/arctic/convert_topics.py     | 14 ++++++++++++++
 6 files changed, 68 insertions(+), 8 deletions(-)

diff --git a/docs/experiments-arctic.md b/docs/experiments-arctic.md
index d51ef4b3a..1db71e011 100644
--- a/docs/experiments-arctic.md
+++ b/docs/experiments-arctic.md
@@ -18,7 +18,7 @@ python -m pyserini.search.faiss --index /store/scratch/sjupadhy/indexes/msmarco-
 --hits 2000 --threads 16 --batch-size 128
 ```
 
-## Merging and compiling docwise results
+### Merging and compiling docwise results
 As the available embeddings are per document segment, the retrieved results need to be compiled into docwise results. Place both run files in a folder (e.g., `arctic_runs`) and merge them with:
 ```bash
 python scripts/arctic/merge_retrieved_results.py --arctic_run_folder arctic_runs \
@@ -26,11 +26,25 @@ python scripts/arctic/merge_retrieved_results.py --arctic_run_folder arctic_runs
 --output_file run.msmarco-v2.1-doc.arctic-embed-l-merged.dev.txt \
 --k 1000
 ```
 
-## Evaluation
+### Evaluation
 ```bash
 python -m pyserini.eval.trec_eval -c -m recall.1000 -m recall.100 -m ndcg_cut.10 /store/scratch/sjupadhy/msmarco-v2.1-snowflake-arctic-embed-l/qrels/qrels_qrels.msmarco-v2.1-doc.dev.txt run.msmarco-v2.1-doc.arctic-embed-l-merged.dev.txt
 Results:
 recall_1000 all 0.9408
 recall_100 all 0.8513
 ndcg_cut_10 all 0.3583
+```
+
+## BEIR
+Retrieval run on the NQ subset:
+```bash
+python -m pyserini.search.faiss --threads 16 --batch-size 512 --index beir-v1.0.0-nq.arctic-embed-m-v1.5 --topics beir-v1.0.0-nq-test --encoded-queries snowflake-arctic-embed-m-v1.5-beir-v1.0.0-nq-test --output run.beir.arctic-embed.nq.txt --hits 1000
+```
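+
+The run above can also be reproduced programmatically. A minimal sketch (the index and encoded-query names are the ones registered in this PR; since no encoder model is loaded, the query string must exactly match one of the pre-encoded test queries):
+```python
+from pyserini.search.faiss import FaissSearcher, QueryEncoder
+
+# Pre-encoded NQ test queries, so the arctic-embed model itself is not needed.
+encoder = QueryEncoder.load_encoded_queries('snowflake-arctic-embed-m-v1.5-beir-v1.0.0-nq-test')
+searcher = FaissSearcher.from_prebuilt_index('beir-v1.0.0-nq.arctic-embed-m-v1.5', encoder)
+
+query = '...'  # any query string from the beir-v1.0.0-nq-test topics
+for hit in searcher.search(query, k=10):
+    print(hit.docid, round(hit.score, 4))
+```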
+
+### Evaluation
+```bash
+python -m pyserini.eval.trec_eval -c -m ndcg_cut.10 -m recall.1000 -m recall.100 beir-v1.0.0-nq-test run.beir.arctic-embed.nq.txt
+Results:
+recall_1000 all 0.9951
+ndcg_cut_10 all 0.6244
 ```
\ No newline at end of file
diff --git a/pyserini/encoded_query_info.py b/pyserini/encoded_query_info.py
index e3dabcb06..5fd84e757 100644
--- a/pyserini/encoded_query_info.py
+++ b/pyserini/encoded_query_info.py
@@ -1034,5 +1034,15 @@
         "size (bytes)": 135674,
         "total_queries": 49,
         "downloaded": False
+    },
+    "snowflake-arctic-embed-m-v1.5-beir-v1.0.0-nq-test": {
+        "description": "BEIR v1.0.0 nq test queries encoded by Snowflake arctic-embed-m-v1.5.",
+        "urls": [
+            "https://www.dropbox.com/scl/fi/i3du2elac5gxlvn595ioc/query-embedding-snowflake-arctic-embed-m-v1.5-beir-v1.0.0-nq-test-20240908.tar.gz?dl=1&rlkey=wkwfg8qpputewgixj7k7osxdv&st=zdlh9nrz"
+        ],
+        "md5": "9d869f5503c1b2606035ac0d512b666b",
+        "size (bytes)": 9461363,
+        "total_queries": 3452,
+        "downloaded": False
     },
 }
diff --git a/pyserini/prebuilt_index_info.py b/pyserini/prebuilt_index_info.py
index d0ec0bf17..0cc8a1125 100644
--- a/pyserini/prebuilt_index_info.py
+++ b/pyserini/prebuilt_index_info.py
@@ -5062,7 +5062,20 @@
         "documents": 5183,
         "downloaded": False,
         "texts": "beir-v1.0.0-scifact.flat"
-    }
+    },
+    "beir-v1.0.0-nq.arctic-embed-m-v1.5": {
+        "description": "Faiss flat index for BEIR (v1.0.0): NQ, encoded by Arctic embed-m-v1.5.",
+        "filename": "faiss-flat.beir-v1.0.0-nq.arctic-embed-m-v1.5.20240908.tar.gz",
+        "readme": "faiss-flat.beir-v1.0.0.arctic-embed-m-v1.5.20240908.README.md",
+        "urls": [
+            "https://www.dropbox.com/scl/fi/abu8kn936rrxd85dk6sqy/faiss-flat.beir-v1.0.0-nq.arctic-embed-m-v1.5.20240908.tar.gz?dl=1&rlkey=9s7hy66d8en0nbv3ih6wjxh95&st=ooinw62c"
+        ],
+        "md5": "9a0dfded63e4554a002866ea57f4a30e",
+        "size compressed (bytes)": 7617697773,
+        "documents": 2681468,
+        "downloaded": False,
+        "texts": "beir-v1.0.0-nq.flat"
+    },
 }
 
 FAISS_INDEX_INFO_MRTYDI = {
diff --git a/pyserini/util.py b/pyserini/util.py
index 530915342..98fd24ee0 100644
--- a/pyserini/util.py
+++ b/pyserini/util.py
@@ -69,6 +69,7 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb
     if not local_filename:
         filename = url.split('/')[-1]
         filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter
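+        # Newer Dropbox share links carry extra parameters (e.g. '?dl=1&rlkey=...&st=...',
+        # as in the URLs registered above), which the '?dl=1$' pattern alone misses,
+        # so strip any remaining query string as well.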
+        filename = re.sub(r'\?.*$', '', filename)
     else:
         # Otherwise, use the specified local_filename:
         filename = local_filename
diff --git a/scripts/arctic/convert_embeddings.py b/scripts/arctic/convert_embeddings.py
index 5612fb6f8..fca3e3e41 100644
--- a/scripts/arctic/convert_embeddings.py
+++ b/scripts/arctic/convert_embeddings.py
@@ -1,5 +1,4 @@
-"""Converts the hgf embeddings for documents into a Pyserini-compatible format.
-"""
+"""Converts the hgf embeddings for documents into a Pyserini-compatible format."""
 
 import argparse
 import os
@@ -26,12 +25,21 @@
         required=False,
         default="full",
     )
+    parser.add_argument(
+        "--start_filter",
+        type=str,
+        help="Only include parquet files whose names start with this prefix.",
+        required=False,
+        default="",
+    )
 
     args = parser.parse_args()
 
+    os.makedirs(args.output, exist_ok=True)
+
     folder_path = args.embeddings_folder
 
-    files = sorted(file for file in os.listdir(folder_path) if file.endswith(".parquet"))
+    files = sorted(file for file in os.listdir(folder_path) if file.endswith(".parquet") and file.startswith(args.start_filter))
 
     if args.indices == "full":
         start = 0
@@ -46,12 +54,12 @@
     for file_name in tqdm(files[start:end]):
         file_path = os.path.join(folder_path, file_name)
         df = pd.read_parquet(file_path)
-        embeddings = df["embedding"].tolist()
+        embeddings = df["VECTOR_MAIN"].tolist()
         embeddings = np.array(embeddings, dtype=np.float32)  # Faiss requires float32
         dim = embeddings[0].shape[0]
         faiss.normalize_L2(embeddings)
         all_embeddings.append(embeddings.reshape(-1, dim))
-        doc_ids.extend(df["doc_id"].tolist())
+        doc_ids.extend(df["DOC_ID"].tolist())
 
     combined_embeddings = np.vstack(all_embeddings)
 
diff --git a/scripts/arctic/convert_topics.py b/scripts/arctic/convert_topics.py
index 1788d19ef..50bc60774 100644
--- a/scripts/arctic/convert_topics.py
+++ b/scripts/arctic/convert_topics.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pandas as pd
 
+from pyserini.search import get_topics
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -22,12 +22,24 @@
     parser.add_argument(
         "--output", type=str, help="Path to store embedding.pkl.", required=True
     )
+    parser.add_argument(
+        "--topic", type=str, help="Pyserini topic name, used to look up query text.", required=True
+    )
 
     args = parser.parse_args()
 
+    os.makedirs(args.output, exist_ok=True)
+
     df = pd.read_parquet(args.embedding_path)
+    if "embedding" not in df.columns:
+        df.rename(columns={"VECTOR_MAIN": "embedding", "QUERY_ID": "id"}, inplace=True)
     array_2d = np.vstack(df["embedding"].values).astype(np.float32)
     faiss.normalize_L2(array_2d)
     df["embedding"] = [array_2d[i, :] for i in range(array_2d.shape[0])]
+    if "text" not in df.columns:
+        topics_mapping = get_topics(args.topic)
+        text_list = [topics_mapping.get(topic).get("title") for topic in df["id"].to_list()]
+        df["text"] = text_list
+
 
     df.to_pickle(os.path.join(args.output, "embedding.pkl"))
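+
+    # Note: the pickle written above is what `--encoded-queries` consumes in the
+    # retrieval runs: one row per query, with an "id", the query "text", and a
+    # unit-norm "embedding".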