Major Index Update: change default index to base_uae_mem (#4241)
This is a major release of the NL feature: it replaces the all-MiniLM-L6
model with uae-large-v1, hosted on Vertex AI.

It also makes fundamental improvements to stat var descriptions, keeping a
single accurate description per stat var. This removes the need for
alternative descriptions.

Finally, it includes a small improvement to the debug-info UI.
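A note on what "hosted on Vertex AI" means in practice: instead of loading a
sentence-transformers model inside the NL server process, embeddings come from
a Vertex AI prediction endpoint (see the vertexAIModels config below). The
sketch below shows what such a call generally looks like; the project,
location, and endpoint ID are copied from the dev config in this commit, and
the plain-string instance format is an assumption, not the repository's actual
client code.

# Minimal sketch: fetch embeddings from a Vertex AI prediction endpoint.
# Assumes the deployed model accepts raw strings as instances; this is NOT
# the repository's actual serving code.
from google.cloud import aiplatform


def embed(queries: list[str]) -> list[list[float]]:
  aiplatform.init(project='datcom-nl', location='us-central1')
  endpoint = aiplatform.Endpoint('1400502935879680000')
  return endpoint.predict(instances=queries).predictions


if __name__ == '__main__':
  # uae-large-v1 produces 1024-dimensional vectors.
  print(len(embed(['number of people'])[0]))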
shifucun authored May 17, 2024
1 parent 08a20fd commit c993fdb
Showing 118 changed files with 24,031 additions and 15,538 deletions.
2 changes: 1 addition & 1 deletion deploy/helm_charts/envs/autopush.yaml
@@ -45,7 +45,7 @@ serviceAccount:
 nl:
   enabled: true
   embeddingsSpec:
-    defaultIndex: "medium_ft"
+    defaultIndex: "base_uae_mem"
     enabledIndexes:
       [
         "base_uae_mem",
9 changes: 5 additions & 4 deletions deploy/helm_charts/envs/dev.yaml
@@ -21,7 +21,7 @@ namespace:
 
 website:
   flaskEnv: dev
-  replicas: 5
+  replicas: 10
 redis:
   enabled: false
 
@@ -33,7 +33,7 @@ serviceGroups:
   svg:
     replicas: 2
   observation:
-    replicas: 5
+    replicas: 10
   node:
     replicas: 10
   default:
@@ -42,7 +42,7 @@ serviceGroups:
 nl:
   enabled: true
   embeddingsSpec:
-    defaultIndex: "medium_ft"
+    defaultIndex: "base_uae_mem"
     enabledIndexes: [
       "base_uae_mem",
       "bio_ft",
@@ -52,6 +52,7 @@ nl:
       "medium_vertex_mistral",
       "sdg_ft",
       "undata_ft",
+      "undata_ilo_ft",
     ]
   vertexAIModels:
     dc-all-minilm-l6-v2-model:
@@ -61,7 +62,7 @@ nl:
     uae-large-v1-model:
       project_id: datcom-nl
       location: us-central1
-      prediction_endpoint_id: "8110162693219942400"
+      prediction_endpoint_id: "1400502935879680000"
     sfr-embedding-mistral-model:
       project_id: datcom-website-dev
       location: us-central1
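Both environment files make the same switch: defaultIndex moves from
medium_ft to base_uae_mem, and the new default must also appear in
enabledIndexes. A quick sanity check for a values file after an edit like
this, as a hypothetical helper that is not part of this change:

# Sketch: verify defaultIndex is one of enabledIndexes in a helm values file.
# Hypothetical helper, not part of this repository.
import yaml


def check_embeddings_spec(path: str) -> None:
  with open(path) as f:
    values = yaml.safe_load(f)
  spec = values['nl']['embeddingsSpec']
  default, enabled = spec['defaultIndex'], spec['enabledIndexes']
  assert default in enabled, f'{default} not in {enabled}'


check_embeddings_spec('deploy/helm_charts/envs/dev.yaml')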
2 changes: 1 addition & 1 deletion deploy/nl/embeddings.yaml
@@ -49,7 +49,7 @@ indexes:
     model: dc-all-minilm-l6-v2-model
   base_uae_mem:
     store: MEMORY
-    embeddings: gs://datcom-nl-models/embeddings_medium_2024_05_14_17_00_30.4910007712298827776.csv
+    embeddings: gs://datcom-nl-models/embeddings_medium_2024_05_16_13_45_32.8110162693219942400.csv
     model: uae-large-v1-model
   medium_vertex_mistral:
     store: VERTEXAI
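Each entry under indexes binds a store type to an embeddings artifact and the
model that produced it; for base_uae_mem the store is MEMORY, so the GCS CSV
is loaded into RAM and scanned directly. The sketch below illustrates what an
in-memory cosine-similarity lookup does; the CSV schema (a dcid column plus
vector columns) is assumed for illustration and is not the store's actual
format.

# Sketch of an in-memory embeddings lookup: load precomputed vectors, then
# rank stat vars by cosine similarity against a query vector.
import numpy as np
import pandas as pd


def load_store(csv_path: str):
  df = pd.read_csv(csv_path)
  dcids = df['dcid'].tolist()
  vectors = df.drop(columns=['dcid']).to_numpy(dtype=np.float32)
  # Normalize once so a dot product equals cosine similarity.
  vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)
  return dcids, vectors


def top_k(query_vec: np.ndarray, dcids, vectors, k: int = 5):
  query_vec = query_vec / np.linalg.norm(query_vec)
  scores = vectors @ query_vec
  best = np.argsort(-scores)[:k]
  return [(dcids[i], float(scores[i])) for i in best]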
1 change: 0 additions & 1 deletion nl_server/loader.py
@@ -117,7 +117,6 @@ def _load_yaml(flask_env: str, enabled_indexes: List[str]) -> Dict[str, any]:
   embeddings_map['indexes'].update(custom_map.get('indexes', {}))
   embeddings_map['models'].update(custom_map.get('models', {}))
 
-  logging.info(f'Attempting to load NL YAML: {embeddings_map}')
   return embeddings_map
 
 
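Dropping this logging.info call stops the server from dumping the entire
merged embeddings map (every index and model entry) into the logs on each
load. If some startup visibility is still wanted, a terser line would do; a
hypothetical replacement, not part of this commit:

logging.info('Loaded NL indexes: %s', sorted(embeddings_map['indexes'].keys()))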
10 changes: 4 additions & 6 deletions nl_server/routes.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import json
-from typing import Dict, List
+from typing import List
 
 from flask import Blueprint
 from flask import current_app
@@ -79,11 +79,9 @@ def search_vars():
       reranker_name) if reranker_name else None
 
   nl_embeddings = _get_indexes(emb_map, idx)
-  debug_logs = {}
-  results: Dict[str,
-                VarCandidates] = search.search_vars(nl_embeddings, queries,
-                                                    skip_topics, reranker_model,
-                                                    debug_logs)
+  debug_logs = {'sv_detection_query_index_type': idx}
+  results = search.search_vars(nl_embeddings, queries, skip_topics,
+                               reranker_model, debug_logs)
   q2result = {q: var_candidates_to_dict(result) for q, result in results.items()}
   return json.dumps({
       'queryResults': q2result,
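Pre-seeding debug_logs with the index type relies on the dict being passed by
reference: search.search_vars appends its own entries, and the handler returns
results and debug info together. A stripped-down illustration of the pattern;
the stub function and the debugLogs key are assumptions, not the actual
route's full payload:

# Illustrative pattern only: pre-seed a debug dict, let the callee add to it,
# then serialize results and debug info together.
import json


def search_vars_stub(queries, debug_logs):
  debug_logs['num_queries'] = len(queries)  # callee records its own details
  return {q: [] for q in queries}


debug_logs = {'sv_detection_query_index_type': 'base_uae_mem'}
results = search_vars_stub(['number of people'], debug_logs)
print(json.dumps({'queryResults': results, 'debugLogs': debug_logs}))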
34 changes: 14 additions & 20 deletions nl_server/tests/embeddings_test.py
@@ -20,11 +20,9 @@
 from parameterized import parameterized
 import yaml
 
+from nl_server import embeddings_map as emb_map
 from nl_server.config import parse
-from nl_server.embeddings import Embeddings
-from nl_server.model.sentence_transformer import LocalSentenceTransformerModel
 from nl_server.search import search_vars
-from nl_server.store.memory import MemoryEmbeddingsStore
 from shared.lib.detected_variables import VarCandidates
 
 _root_dir = os.path.dirname(
@@ -58,27 +56,20 @@ class TestEmbeddings(unittest.TestCase):
   def setUpClass(cls) -> None:
     embeddings_spec = _get_embeddings_spec()
     embeddings_info = _get_embeddings_info(embeddings_spec)
-    # TODO(pradh): Expand tests to other index sizes.
-    idx_info = embeddings_info.indexes[embeddings_spec['defaultIndex']]
-    model_info = embeddings_info.models[idx_info.model]
-    cls.nl_embeddings = Embeddings(
-        model=LocalSentenceTransformerModel(model_info),
-        store=MemoryEmbeddingsStore(idx_info))
+    cls.nl_embeddings = emb_map.EmbeddingsMap(embeddings_info).get_index(
+        embeddings_spec['defaultIndex'])
 
   @parameterized.expand([
       # All these queries should detect one of the SVs as the top choice.
       ["number of people", False, ["Count_Person"]],
       ["population of", False, ["dc/topic/Population", "Count_Person"]],
       ["economy of the state", False, ["dc/topic/Economy"]],
-      ["household income", False, ["Median_Income_Household"]],
+      ["household income", False, ["Mean_Income_Household"]],
       [
           "life expectancy in USA", False,
          ["dc/topic/LifeExpectancy", "LifeExpectancy_Person"]
       ],
-      [
-          "GDP", False,
-          ["Amount_EconomicActivity_GrossDomesticProduction_Nominal"]
-      ],
+      ["GDP", False, ["dc/topic/GDP"]],
       ["auto theft", False, ["Count_CriminalActivities_MotorVehicleTheft"]],
       ["agriculture", False, ["dc/topic/Agriculture"]],
       [
@@ -87,17 +78,20 @@ def setUpClass(cls) -> None:
       ],
       [
           "agriculture workers", False,
-          ["dc/hlxvn1t8b9bhh", "Count_Person_MainWorker_AgriculturalLabourers"]
+          ["dc/topic/Agriculture", "dc/15lrzqkb6n0y7"]
       ],
       [
-          "heart disease", False,
+          "coronary heart disease", False,
           [
              "dc/topic/HeartDisease",
              "dc/topic/PopulationWithDiseasesOfHeartByAge",
              "Percent_Person_WithCoronaryHeartDisease"
          ]
       ],
-      ["heart disease", True, ["Percent_Person_WithCoronaryHeartDisease"]],
+      [
+          "coronary heart disease", True,
+          ["Percent_Person_WithCoronaryHeartDisease"]
+      ],
   ])
   def test_sv_detection(self, query_str, skip_topics, expected_list):
     got = search_vars([self.nl_embeddings], [query_str],
@@ -110,7 +104,7 @@ def test_sv_detection(self, query_str, skip_topics, expected_list):
     self.assertTrue(sentences)
 
     # Check that the first SV found is among the expected_list.
-    self.assertTrue(svs[0] in expected_list)
+    self.assertTrue(svs[0] in expected_list, f"{svs[0]} not in {expected_list}")
 
     # TODO: uncomment the lines below when we have figured out what to do with these
     # assertion failures. They started failing when updating to the medium_ft index.
@@ -120,7 +114,7 @@ def test_sv_detection(self, query_str, skip_topics, expected_list):
     # ["AggCosineScore"])
 
   # For these queries, the match score should be low (< 0.45).
-  @parameterized.expand(["random random", "", "who where why", "__124__abc"])
+  @parameterized.expand(["random random", "who where why", "__124__abc"])
   def test_low_score_matches(self, query_str):
     got = search_vars([self.nl_embeddings], [query_str])[query_str]
 
@@ -132,4 +126,4 @@ def test_low_score_matches(self, query_str):
 
     # Check all scores.
     for score in scores:
-      self.assertLess(score, 0.45)
+      self.assertLess(score, 0.7)
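The low-score ceiling rises from 0.45 to 0.7 because absolute cosine scores
are model-specific: scores from uae-large-v1 sit in a different range than
all-MiniLM-L6-v2's, so gibberish queries can legitimately score above the old
bound. A quick way to see the effect (illustrative; both names are public
Hugging Face model IDs):

# Compare cosine scores for an unrelated query/sentence pair across the old
# and new embedding models; absolute score ranges differ per model.
from sentence_transformers import SentenceTransformer, util

for name in ['all-MiniLM-L6-v2', 'WhereIsAI/UAE-Large-V1']:
  model = SentenceTransformer(name)
  query, sentence = model.encode(['random random', 'number of people'])
  print(name, float(util.cos_sim(query, sentence)))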
6 changes: 2 additions & 4 deletions run_test.sh
@@ -98,6 +98,7 @@ function run_py_test {
   # Run server pytest.
   source .env/bin/activate
   export FLASK_ENV=test
+  export TOKENIZERS_PARALLELISM=false
   # Disabled nodejs e2e test to avoid dependency on dev
   python3 -m pytest server/tests/ -s --ignore=server/tests/nodejs_e2e_test.py
   python3 -m pytest shared/tests/ -s
@@ -188,10 +189,7 @@ function update_integration_test_golden {
   export LLM_API_KEY=
   export ENABLE_EVAL_TOOL=true
 
-  export ENV_PREFIX=Autopush
-  python3 -m pytest -vv server/integration_tests/topic_cache
-  # Disabled nodejs e2e test to avoid dependency on dev
-  # python3 -m pytest -vv server/tests/nodejs_e2e_test.py
   # Run integration test against staging mixer to make it stable.
+  export ENV_PREFIX=Staging
   python3 -m pytest -vv -n 5 --reruns 2 server/integration_tests/
 }
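TOKENIZERS_PARALLELISM=false silences the Hugging Face tokenizers warning
("The current process just got forked, after parallelism has already been
used") and avoids fork-related deadlocks when pytest forks after a tokenizer
has been loaded. The same guard can be applied from Python, as long as it runs
before the library is imported:

# Must run before transformers/sentence_transformers is imported, so the Rust
# tokenizers library never turns on fork-unsafe parallelism.
import os

os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')

from sentence_transformers import SentenceTransformer  # noqa: E402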
22 changes: 11 additions & 11 deletions server/integration_tests/explore_test.py
@@ -264,9 +264,7 @@ def test_detection_basic(self):
                        test='unittest')
 
   def test_detection_basic_lancedb(self):
-    # NOTE: Use the same test-name as above, since we expect the content to exactly
-    # match the one from above.
-    self.run_detection('detection_api_basic', ['Commute in California'],
+    self.run_detection('detection_api_basic_lancedb', ['Commute in California'],
                        test='unittest',
                        idx='medium_lance_ft')
 
@@ -318,14 +316,16 @@ def test_detection_bio(self):
                        check_detection=True)
 
   def test_detection_multivar(self):
-    self.run_detection('detection_api_multivar', [
-        'number of poor hispanic women with phd',
-        'compare obesity vs. poverty',
-        'show me the impact of climate change on drought',
-        'how are factors like obesity, blood pressure and asthma impacted by climate change',
-        'Compare "Male population" with "Female Population"',
-    ],
-                       check_detection=True)
+    self.run_detection(
+        'detection_api_multivar',
+        [
+            'number of poor hispanic women with phd',
+            # 'compare obesity vs. poverty',
+            'show me the impact of climate change on drought',
+            'how are factors like obesity, blood pressure and asthma impacted by climate change',
+            'Compare "Male population" with "Female Population"',
+        ],
+        check_detection=True)
 
   def test_detection_context(self):
     self.run_detection('detection_api_context', [
2 changes: 1 addition & 1 deletion server/integration_tests/nl_test.py
@@ -35,7 +35,7 @@ class NLTest(NLWebServerTestCase):
   def run_sequence(self,
                    test_dir,
                    queries,
-                   idx='medium_ft',
+                   idx='base_uae_mem',
                    detector='hybrid',
                    check_place_detection=False,
                    expected_detectors=[],