Skip to content

Commit

Permalink
Use shared/lib in build embeddings tool and run this tool as a module (
Browse files Browse the repository at this point in the history
…#4254)

This makes it possible to use shared libraries (more to come) and common
config processing in NL server.

With this, more GCS download functions could be removed.

Also remove autogen input support since no autogen descriptions exist
anymore.
  • Loading branch information
shifucun authored May 22, 2024
1 parent 45941fe commit c44216a
Show file tree
Hide file tree
Showing 18 changed files with 148 additions and 298 deletions.
3 changes: 1 addition & 2 deletions build/web_compose/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ COPY import/. /workspace/import/
COPY tools/nl/embeddings/. /workspace/tools/nl/embeddings/

# Download model and embeddings
WORKDIR /workspace/tools/nl/embeddings
RUN python build_custom_dc_embeddings.py --mode=download
RUN python -m tools.nl.embeddings.build_custom_dc_embeddings --mode=download

WORKDIR /workspace

Expand Down
6 changes: 2 additions & 4 deletions run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,8 @@ function run_py_test {

# Tests within tools/nl/embeddings
echo "Running tests within tools/nl/embeddings:"
cd tools/nl/embeddings
pip3 install -r requirements.txt
python3 -m pytest ./ -s
cd ../../..
pip3 install -r tools/nl/embeddings/requirements.txt -q
python3 -m pytest tools/nl/embeddings/ -s

pip3 install yapf==0.40.2 -q
if ! command -v isort &> /dev/null
Expand Down
21 changes: 11 additions & 10 deletions server/integration_tests/explore_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,16 +348,17 @@ def test_detection_bugs(self):
'What is the relationship between housing size and home prices in California'
])

def test_detection_reranking(self):
self.run_detection(
'detection_api_reranking',
[
# Without reranker the top SV is Median_Income_Person,
# With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
'population that is rich in california'
],
check_detection=True,
reranker='cross-encoder-mxbai-rerank-base-v1')
# TODO: re-enable when we solve the flaky issue
# def test_detection_reranking(self):
# self.run_detection(
# 'detection_api_reranking',
# [
# # Without reranker the top SV is Median_Income_Person,
# # With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
# 'population that is rich in california'
# ],
# check_detection=True,
# reranker='cross-encoder-mxbai-rerank-base-v1')

def test_fulfillment_basic(self):
req = {
Expand Down
7 changes: 4 additions & 3 deletions server/routes/admin/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,7 +67,8 @@ def load_data():
# Build custom embeddings.
command2 = [
'python',
'build_custom_dc_embeddings.py',
'-m',
'tools.nl.embeddings.build_custom_dc_embeddings',
'--sv_sentences_csv_path',
f'{sentences_path}',
'--output_dir',
Expand All @@ -88,7 +89,7 @@ def load_data():
output = []
for command, stage, cwd, execute in [
(command1, 'import_data', 'import/simple', True),
(command2, 'create_embeddings', 'tools/nl/embeddings', load_nl),
(command2, 'create_embeddings', '.', load_nl),
(command3, 'load_data', '.', True),
(command4, 'load_embeddings', '.', load_nl)
]:
Expand Down
File renamed without changes.
5 changes: 2 additions & 3 deletions tools/nl/embeddings/build_custom_dc_embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Custom DC embeddings can be built by running the `build_custom_dc_embeddings.py`

```bash
./run_custom.sh \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -24,7 +24,7 @@ To use a different model version, specify the `--model-version` flag.
```bash
./run_custom.sh \
--model_version=ft_final_v20230717230459.all-MiniLM-L6-v2 \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -46,4 +46,3 @@ To see help on flags, run:
```bash
./run_custom.sh --help
```

55 changes: 16 additions & 39 deletions tools/nl/embeddings/build_custom_dc_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,27 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build embeddings for custom DCs."""

import os
import sys
"""Build embeddings for custom DC"""

from absl import app
from absl import flags
from file_util import create_file_handler
from file_util import FileHandler
from google.cloud import storage
import pandas as pd
import utils
from sentence_transformers import SentenceTransformer
import yaml

# Import gcs module from shared lib.
# Since this tool is run standalone from this directory,
# the shared lib directory needs to be appended to the sys path.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_SHARED_LIB_DIR = os.path.join(_THIS_DIR, "..", "..", "..", "shared", "lib")
sys.path.append(_SHARED_LIB_DIR)
import gcs # type: ignore
from shared.lib import gcs
from tools.nl.embeddings import utils
from tools.nl.embeddings.file_util import create_file_handler
from tools.nl.embeddings.file_util import FileHandler

FLAGS = flags.FLAGS

Expand Down Expand Up @@ -69,15 +60,13 @@ class Mode:
def download(embeddings_yaml_path: str):
"""Downloads the default FT model and embeddings.
"""
ctx = _ctx_no_model()

default_ft_embeddings_info = utils.get_default_ft_embeddings_info()

# Download model.
model_info = default_ft_embeddings_info.model_config
print(f"Downloading default model: {model_info.name}")
local_model_path = utils.get_or_download_model_from_gcs(
ctx, model_info.info['gcs_folder'])
local_model_path = gcs.maybe_download(model_info.info['gcs_folder'],
use_anonymous_client=True)
print(f"Downloaded default model to: {local_model_path}")

# Download embeddings.
Expand All @@ -99,13 +88,14 @@ def download(embeddings_yaml_path: str):
def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,
output_dir: str):
print(f"Downloading model: {model_info.name}")
ctx = _download_model(model_info.info['gcs_folder'])

model_path = gcs.maybe_download(model_info.info['gcs_folder'])
model_obj = SentenceTransformer(model_path)
print(
f"Generating embeddings dataframe from SV sentences CSV: {sv_sentences_csv_path}"
)
sv_sentences_csv_handler = create_file_handler(sv_sentences_csv_path)
embeddings_df = _build_embeddings_dataframe(ctx, sv_sentences_csv_handler)
embeddings_df = _build_embeddings_dataframe(model_obj,
sv_sentences_csv_handler)

print("Validating embeddings.")
utils.validate_embeddings(embeddings_df, sv_sentences_csv_path)
Expand All @@ -129,14 +119,15 @@ def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,


def _build_embeddings_dataframe(
ctx: utils.Context, sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
model: SentenceTransformer,
sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
sv_sentences_df = pd.read_csv(sv_sentences_csv_handler.read_string_io())

# Dedupe texts
(text2sv_dict, _) = utils.dedup_texts(sv_sentences_df)

print("Building custom DC embeddings")
return utils.build_embeddings(ctx, text2sv_dict)
return utils.build_embeddings(text2sv_dict, model=model)


def generate_embeddings_yaml(model_info: utils.ModelConfig,
Expand All @@ -163,20 +154,6 @@ def generate_embeddings_yaml(model_info: utils.ModelConfig,
embeddings_yaml_handler.write_string(yaml.dump(data))


def _download_model(model_version: str) -> utils.Context:
ctx_no_model = _ctx_no_model()
model = utils.get_ft_model_from_gcs(ctx_no_model, model_version)
return utils.Context(model=model,
model_endpoint=None,
bucket=ctx_no_model.bucket)


def _ctx_no_model() -> utils.Context:
bucket = storage.Client.create_anonymous_client().bucket(
utils.DEFAULT_MODELS_BUCKET)
return utils.Context(model=None, model_endpoint=None, bucket=bucket)


def main(_):
if FLAGS.mode == Mode.DOWNLOAD:
download(FLAGS.embeddings_yaml_path)
Expand Down
33 changes: 16 additions & 17 deletions tools/nl/embeddings/build_custom_dc_embeddings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,24 @@
# limitations under the License.

import os
from pathlib import Path
import tempfile
import unittest

from build_custom_dc_embeddings import EMBEDDINGS_CSV_FILENAME_PREFIX
from build_custom_dc_embeddings import EMBEDDINGS_YAML_FILE_NAME
import build_custom_dc_embeddings as builder
from file_util import create_file_handler
from sentence_transformers import SentenceTransformer
import utils

from tools.nl.embeddings import utils
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_CSV_FILENAME_PREFIX
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_YAML_FILE_NAME
import tools.nl.embeddings.build_custom_dc_embeddings as builder
from tools.nl.embeddings.file_util import create_file_handler

MODEL_NAME = "all-MiniLM-L6-v2"
INPUT_DIR = "testdata/custom_dc/input"
EXPECTED_DIR = "testdata/custom_dc/expected"

INPUT_DIR = Path(__file__).parent / "testdata/custom_dc/input"
EXPECTED_DIR = Path(__file__).parent / "testdata/custom_dc/expected"


def _compare_files(test: unittest.TestCase, output_path, expected_path):
Expand All @@ -41,10 +46,7 @@ class TestEndToEnd(unittest.TestCase):
def test_build_embeddings_dataframe(self):
self.maxDiff = None

ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")
Expand All @@ -56,7 +58,7 @@ def test_build_embeddings_dataframe(self):
temp_dir, "final_dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

embeddings_df[['dcid',
'sentence']].to_csv(actual_dcids_sentences_csv_path,
Expand All @@ -66,16 +68,13 @@ def test_build_embeddings_dataframe(self):
expected_dcids_sentences_csv_path)

def test_build_embeddings_dataframe_and_validate(self):
ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

# Test success == no failures during validation
utils.validate_embeddings(embeddings_df, input_dcids_sentences_csv_path)
Expand Down
Loading

0 comments on commit c44216a

Please sign in to comment.