Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use shared/lib in build embeddings tool and run this tool as a module #4254

Merged
merged 8 commits into from
May 22, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,8 @@ function run_py_test {

# Tests within tools/nl/embeddings
echo "Running tests within tools/nl/embeddings:"
cd tools/nl/embeddings
pip3 install -r requirements.txt
python3 -m pytest ./ -s
cd ../../..
pip3 install -r tools/nl/embeddings/requirements.txt -q
python3 -m pytest tools/nl/embeddings/ -s

pip3 install yapf==0.40.2 -q
if ! command -v isort &> /dev/null
Expand Down
21 changes: 11 additions & 10 deletions server/integration_tests/explore_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,16 +348,17 @@ def test_detection_bugs(self):
'What is the relationship between housing size and home prices in California'
])

def test_detection_reranking(self):
self.run_detection(
'detection_api_reranking',
[
# Without reranker the top SV is Median_Income_Person,
# With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
'population that is rich in california'
],
check_detection=True,
reranker='cross-encoder-mxbai-rerank-base-v1')
# TODO: re-enable when we solve the flaky issue
# def test_detection_reranking(self):
# self.run_detection(
# 'detection_api_reranking',
# [
# # Without reranker the top SV is Median_Income_Person,
# # With reranking the top SV is Count_Person_IncomeOf75000OrMoreUSDollar.
# 'population that is rich in california'
# ],
# check_detection=True,
# reranker='cross-encoder-mxbai-rerank-base-v1')

def test_fulfillment_basic(self):
req = {
Expand Down
7 changes: 4 additions & 3 deletions server/routes/admin/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,7 +67,8 @@ def load_data():
# Build custom embeddings.
command2 = [
'python',
'build_custom_dc_embeddings.py',
'-m',
'tools.nl.embeddings.build_custom_dc_embeddings',
'--sv_sentences_csv_path',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Dockerfile will also need to be updated and validated that it builds correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated and verified with docker build locally.

f'{sentences_path}',
'--output_dir',
Expand All @@ -88,7 +89,7 @@ def load_data():
output = []
for command, stage, cwd, execute in [
(command1, 'import_data', 'import/simple', True),
(command2, 'create_embeddings', 'tools/nl/embeddings', load_nl),
(command2, 'create_embeddings', '.', load_nl),
(command3, 'load_data', '.', True),
(command4, 'load_embeddings', '.', load_nl)
]:
Expand Down
File renamed without changes.
5 changes: 2 additions & 3 deletions tools/nl/embeddings/build_custom_dc_embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Custom DC embeddings can be built by running the `build_custom_dc_embeddings.py`

```bash
./run_custom.sh \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -24,7 +24,7 @@ To use a different model version, specify the `--model-version` flag.
```bash
./run_custom.sh \
--model_version=ft_final_v20230717230459.all-MiniLM-L6-v2 \
--sv_sentences_csv_path=testdata/custom_dc/input/dcids_sentences.csv \
--sv_sentences_csv_path=$PWD/testdata/custom_dc/input/dcids_sentences.csv \
--output_dir=/tmp
```

Expand All @@ -46,4 +46,3 @@ To see help on flags, run:
```bash
./run_custom.sh --help
```

54 changes: 15 additions & 39 deletions tools/nl/embeddings/build_custom_dc_embeddings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2023 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -11,27 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build embeddings for custom DCs."""

import os
import sys
"""Build embeddings for custom DC"""

from absl import app
from absl import flags
from file_util import create_file_handler
from file_util import FileHandler
from google.cloud import storage
import pandas as pd
import utils
from sentence_transformers import SentenceTransformer
import yaml

# Import gcs module from shared lib.
# Since this tool is run standalone from this directory,
# the shared lib directory needs to be appended to the sys path.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
_SHARED_LIB_DIR = os.path.join(_THIS_DIR, "..", "..", "..", "shared", "lib")
sys.path.append(_SHARED_LIB_DIR)
import gcs # type: ignore
from shared.lib import gcs
from tools.nl.embeddings import utils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is so much better!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indeed!

from tools.nl.embeddings.file_util import create_file_handler
from tools.nl.embeddings.file_util import FileHandler

FLAGS = flags.FLAGS

Expand Down Expand Up @@ -69,15 +60,12 @@ class Mode:
def download(embeddings_yaml_path: str):
"""Downloads the default FT model and embeddings.
"""
ctx = _ctx_no_model()

default_ft_embeddings_info = utils.get_default_ft_embeddings_info()

# Download model.
model_info = default_ft_embeddings_info.model_config
print(f"Downloading default model: {model_info.name}")
local_model_path = utils.get_or_download_model_from_gcs(
ctx, model_info.info['gcs_folder'])
local_model_path = gcs.maybe_download(model_info.info['gcs_folder'])
print(f"Downloaded default model to: {local_model_path}")

# Download embeddings.
Expand All @@ -99,13 +87,14 @@ def download(embeddings_yaml_path: str):
def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,
output_dir: str):
print(f"Downloading model: {model_info.name}")
ctx = _download_model(model_info.info['gcs_folder'])

model_path = gcs.maybe_download(model_info.info['gcs_folder'])
model_obj = SentenceTransformer(model_path)
print(
f"Generating embeddings dataframe from SV sentences CSV: {sv_sentences_csv_path}"
)
sv_sentences_csv_handler = create_file_handler(sv_sentences_csv_path)
embeddings_df = _build_embeddings_dataframe(ctx, sv_sentences_csv_handler)
embeddings_df = _build_embeddings_dataframe(model_obj,
sv_sentences_csv_handler)

print("Validating embeddings.")
utils.validate_embeddings(embeddings_df, sv_sentences_csv_path)
Expand All @@ -129,14 +118,15 @@ def build(model_info: utils.ModelConfig, sv_sentences_csv_path: str,


def _build_embeddings_dataframe(
ctx: utils.Context, sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
model: SentenceTransformer,
sv_sentences_csv_handler: FileHandler) -> pd.DataFrame:
sv_sentences_df = pd.read_csv(sv_sentences_csv_handler.read_string_io())

# Dedupe texts
(text2sv_dict, _) = utils.dedup_texts(sv_sentences_df)

print("Building custom DC embeddings")
return utils.build_embeddings(ctx, text2sv_dict)
return utils.build_embeddings(text2sv_dict, model=model)


def generate_embeddings_yaml(model_info: utils.ModelConfig,
Expand All @@ -163,20 +153,6 @@ def generate_embeddings_yaml(model_info: utils.ModelConfig,
embeddings_yaml_handler.write_string(yaml.dump(data))


def _download_model(model_version: str) -> utils.Context:
ctx_no_model = _ctx_no_model()
model = utils.get_ft_model_from_gcs(ctx_no_model, model_version)
return utils.Context(model=model,
model_endpoint=None,
bucket=ctx_no_model.bucket)


def _ctx_no_model() -> utils.Context:
bucket = storage.Client.create_anonymous_client().bucket(
utils.DEFAULT_MODELS_BUCKET)
return utils.Context(model=None, model_endpoint=None, bucket=bucket)


def main(_):
if FLAGS.mode == Mode.DOWNLOAD:
download(FLAGS.embeddings_yaml_path)
Expand Down
33 changes: 16 additions & 17 deletions tools/nl/embeddings/build_custom_dc_embeddings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,24 @@
# limitations under the License.

import os
from pathlib import Path
import tempfile
import unittest

from build_custom_dc_embeddings import EMBEDDINGS_CSV_FILENAME_PREFIX
from build_custom_dc_embeddings import EMBEDDINGS_YAML_FILE_NAME
import build_custom_dc_embeddings as builder
from file_util import create_file_handler
from sentence_transformers import SentenceTransformer
import utils

from tools.nl.embeddings import utils
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_CSV_FILENAME_PREFIX
from tools.nl.embeddings.build_custom_dc_embeddings import \
EMBEDDINGS_YAML_FILE_NAME
import tools.nl.embeddings.build_custom_dc_embeddings as builder
from tools.nl.embeddings.file_util import create_file_handler

MODEL_NAME = "all-MiniLM-L6-v2"
INPUT_DIR = "testdata/custom_dc/input"
EXPECTED_DIR = "testdata/custom_dc/expected"

INPUT_DIR = Path(__file__).parent / "testdata/custom_dc/input"
EXPECTED_DIR = Path(__file__).parent / "testdata/custom_dc/expected"


def _compare_files(test: unittest.TestCase, output_path, expected_path):
Expand All @@ -41,10 +46,7 @@ class TestEndToEnd(unittest.TestCase):
def test_build_embeddings_dataframe(self):
self.maxDiff = None

ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")
Expand All @@ -56,7 +58,7 @@ def test_build_embeddings_dataframe(self):
temp_dir, "final_dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

embeddings_df[['dcid',
'sentence']].to_csv(actual_dcids_sentences_csv_path,
Expand All @@ -66,16 +68,13 @@ def test_build_embeddings_dataframe(self):
expected_dcids_sentences_csv_path)

def test_build_embeddings_dataframe_and_validate(self):
ctx = utils.Context(model=SentenceTransformer(MODEL_NAME),
model_endpoint=None,
bucket=None,
tmp="/tmp")
model = SentenceTransformer(MODEL_NAME)

input_dcids_sentences_csv_path = os.path.join(INPUT_DIR,
"dcids_sentences.csv")

embeddings_df = builder._build_embeddings_dataframe(
ctx, create_file_handler(input_dcids_sentences_csv_path))
model, create_file_handler(input_dcids_sentences_csv_path))

# Test success == no failures during validation
utils.validate_embeddings(embeddings_df, input_dcids_sentences_csv_path)
Expand Down
Loading
Loading