Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use shared/lib in build embeddings tool and run this tool as a module #4254

Merged
merged 8 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,8 @@ function run_py_test {

# Tests within tools/nl/embeddings
echo "Running tests within tools/nl/embeddings:"
cd tools/nl/embeddings
pip3 install -r requirements.txt
python3 -m pytest ./ -s
cd ../../..
pip3 install -r tools/nl/embeddings/requirements.txt -q
python3 -m pytest tools/nl/embeddings/ -s

pip3 install yapf==0.40.2 -q
if ! command -v isort &> /dev/null
Expand Down
21 changes: 11 additions & 10 deletions server/integration_tests/explore_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,16 +348,17 @@ def test_detection_bugs(self):
'What is the relationship between housing size and home prices in California'
])

def test_detection_reranking(self):
self.run_detection(
'detection_api_reranking',
[
# Without reranker the top SV is Median_Income_Person,
# With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
'population that is rich in california'
],
check_detection=True,
reranker='cross-encoder-mxbai-rerank-base-v1')
# TODO: re-enable when we solve the flaky issue
# def test_detection_reranking(self):
# self.run_detection(
# 'detection_api_reranking',
# [
# # Without reranker the top SV is Median_Income_Person,
# # With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
# 'population that is rich in california'
# ],
# check_detection=True,
# reranker='cross-encoder-mxbai-rerank-base-v1')

def test_fulfillment_basic(self):
req = {
Expand Down
7 changes: 4 additions & 3 deletions server/routes/admin/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,7 +67,8 @@ def load_data():
# Build custom embeddings.
command2 = [
'python',
'build_custom_dc_embeddings.py',
'-m',
'tools.nl.embeddings.build_custom_dc_embeddings',
'--sv_sentences_csv_path',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Dockerfile will also need to be updated and validated that it builds correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and verified with docker build locally.

f'{sentences_path}',
'--output_dir',
Expand All @@ -88,7 +89,7 @@ def load_data():
output = []
for command, stage, cwd, execute in [
(command1, 'import_data', 'import/simple', True),
(command2, 'create_embeddings', 'tools/nl/embeddings', load_nl),
(command2, 'create_embeddings', '.', load_nl),
(command3, 'load_data', '.', True),
(command4, 'load_embeddings', '.', load_nl)
]:
Expand Down
File renamed without changes.
5 changes: 2 additions & 3 deletions tools/nl/embeddings/build_custom_dc_embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Custom DC embeddings can be built by running the `build_custom_dc_embeddings.py`

```bash
./run_custom.sh \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -24,7 +24,7 @@ To use a different model version, specify the `--model-version` flag.
```bash
./run_custom.sh \
--model_version=ft_final_v20230717230459.all-MiniLM-L6-v2 \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -46,4 +46,3 @@ To see help on flags, run:
```bash
./run_custom.sh --help
```

54 changes: 15 additions & 39 deletions tools/nl/embeddings/build_custom_dc_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,27 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build embeddings for custom DCs."""

import os
import sys
"""Build embeddings for custom DC"""

from absl import app
from absl import flags
from file_util import create_file_handler
from file_util import FileHandler
from google.cloud import storage
import pandas as pd
import utils
from sentence_transformers import SentenceTransformer
import yaml

# Import gcs module from shared lib.
# Since this tool is run standalone from this directory,
# the shared lib directory needs to be appended to the sys path.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_SHARED_LIB_DIR = os.path.join(_THIS_DIR, "..", "..", "..", "shared", "lib")
sys.path.append(_SHARED_LIB_DIR)
import gcs # type: ignore
from shared.lib import gcs
from tools.nl.embeddings import utils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is so much better!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indeed!

from tools.nl.embeddings.file_util import create_file_handler
from tools.nl.embeddings.file_util import FileHandler

FLAGS = flags.FLAGS

Expand Down Expand Up @@ -69,15 +60,12 @@ class Mode:
def download(embeddings_yaml_path: str):
"""Downloads the default FT model and embeddings.
"""
ctx = _ctx_no_model()

default_ft_embeddings_info = utils.get_default_ft_embeddings_info()

# Download model.
model_info = default_ft_embeddings_info.model_config
print(f"Downloading default model: {model_info.name}")
local_model_path = utils.get_or_download_model_from_gcs(
ctx, model_info.info['gcs_folder'])
local_model_path = gcs.maybe_download(model_info.info['gcs_folder'])
print(f"Downloaded default model to: {local_model_path}")

# Download embeddings.
Expand All @@ -99,13 +87,14 @@ def download(embeddings_yaml_path: str):
def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,
output_dir: str):
print(f"Downloading model: {model_info.name}")
ctx = _download_model(model_info.info['gcs_folder'])

model_path = gcs.maybe_download(model_info.info['gcs_folder'])
model_obj = SentenceTransformer(model_path)
print(
f"Generating embeddings dataframe from SV sentences CSV: {sv_sentences_csv_path}"
)
sv_sentences_csv_handler = create_file_handler(sv_sentences_csv_path)
embeddings_df = _build_embeddings_dataframe(ctx, sv_sentences_csv_handler)
embeddings_df = _build_embeddings_dataframe(model_obj,
sv_sentences_csv_handler)

print("Validating embeddings.")
utils.validate_embeddings(embeddings_df, sv_sentences_csv_path)
Expand All @@ -129,14 +118,15 @@ def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,


def _build_embeddings_dataframe(
ctx: utils.Context, sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
model: SentenceTransformer,
sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
sv_sentences_df = pd.read_csv(sv_sentences_csv_handler.read_string_io())

# Dedupe texts
(text2sv_dict, _) = utils.dedup_texts(sv_sentences_df)

print("Building custom DC embeddings")
return utils.build_embeddings(ctx, text2sv_dict)
return utils.build_embeddings(text2sv_dict, model=model)


def generate_embeddings_yaml(model_info: utils.ModelConfig,
Expand All @@ -163,20 +153,6 @@ def generate_embeddings_yaml(model_info: utils.ModelConfig,
embeddings_yaml_handler.write_string(yaml.dump(data))


def _download_model(model_version: str) -> utils.Context:
ctx_no_model = _ctx_no_model()
model = utils.get_ft_model_from_gcs(ctx_no_model, model_version)
return utils.Context(model=model,
model_endpoint=None,
bucket=ctx_no_model.bucket)


def _ctx_no_model() -> utils.Context:
bucket = storage.Client.create_anonymous_client().bucket(
utils.DEFAULT_MODELS_BUCKET)
return utils.Context(model=None, model_endpoint=None, bucket=bucket)


def main(_):
if FLAGS.mode == Mode.DOWNLOAD:
download(FLAGS.embeddings_yaml_path)
Expand Down
33 changes: 16 additions & 17 deletions tools/nl/embeddings/build_custom_dc_embeddings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,24 @@
# limitations under the License.

import os
from pathlib import Path
import tempfile
import unittest

from build_custom_dc_embeddings import EMBEDDINGS_CSV_FILENAME_PREFIX
from build_custom_dc_embeddings import EMBEDDINGS_YAML_FILE_NAME
import build_custom_dc_embeddings as builder
from file_util import create_file_handler
from sentence_transformers import SentenceTransformer
import utils

from tools.nl.embeddings import utils
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_CSV_FILENAME_PREFIX
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_YAML_FILE_NAME
import tools.nl.embeddings.build_custom_dc_embeddings as builder
from tools.nl.embeddings.file_util import create_file_handler

MODEL_NAME = "all-MiniLM-L6-v2"
INPUT_DIR = "testdata/custom_dc/input"
EXPECTED_DIR = "testdata/custom_dc/expected"

INPUT_DIR = Path(__file__).parent / "testdata/custom_dc/input"
EXPECTED_DIR = Path(__file__).parent / "testdata/custom_dc/expected"


def _compare_files(test: unittest.TestCase, output_path, expected_path):
Expand All @@ -41,10 +46,7 @@ class TestEndToEnd(unittest.TestCase):
def test_build_embeddings_dataframe(self):
self.maxDiff = None

ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")
Expand All @@ -56,7 +58,7 @@ def test_build_embeddings_dataframe(self):
temp_dir, "final_dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

embeddings_df[['dcid',
'sentence']].to_csv(actual_dcids_sentences_csv_path,
Expand All @@ -66,16 +68,13 @@ def test_build_embeddings_dataframe(self):
expected_dcids_sentences_csv_path)

def test_build_embeddings_dataframe_and_validate(self):
ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

# Test success == no failures during validation
utils.validate_embeddings(embeddings_df, input_dcids_sentences_csv_path)
Expand Down
Loading
Loading