From 8bae60cd3389531f12e6adf63dcf175afc0e44b4 Mon Sep 17 00:00:00 2001 From: Keyur Shah Date: Wed, 26 Jun 2024 00:03:51 -0700 Subject: [PATCH 1/3] Build embeddings with the load data script. --- custom_dc/load_data.sh | 48 ++++++++++++++++++++++++---- tools/nl/embeddings/requirements.txt | 7 +++- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/custom_dc/load_data.sh b/custom_dc/load_data.sh index 5d978880e4..58a5db1436 100755 --- a/custom_dc/load_data.sh +++ b/custom_dc/load_data.sh @@ -82,9 +82,9 @@ function setup_python { echo_log "Installing Python requirements from $embeddings_req" run_cmd pip3 install -r "$embeddings_req" # TODO: remove install once embeddings doesn't need nl_server/requirements.txt - nlserver_req="$WEBSITE_DIR/nl_server/requirements.txt" - echo_log "Installing Python requirements from $nlserver_req" - run_cmd pip3 install -r "$nlserver_req" + # nlserver_req="$WEBSITE_DIR/nl_server/requirements.txt" + # echo_log "Installing Python requirements from $nlserver_req" + # run_cmd pip3 install -r "$nlserver_req" fi fi } @@ -223,9 +223,45 @@ function generate_embeddings { echo_log "Building embeddings for sentences in $NL_DIR" local cwd="$PWD" cd "$WEBSITE_DIR" - # TODO: Enable with new build_embeddings.py - # run_cmd python -m tools.nl.embeddings.build_custom_dc_embeddings \ - # --input_file_path="$NL_DIR/sentences.csv" --output_dir="$NL_DIR" + + NL_EMBEDDINGS_DIR="$NL_DIR/embeddings" + EMBEDDINGS_PATH="$NL_EMBEDDINGS_DIR/embeddings.csv" + CUSTOM_EMBEDDING_INDEX="user_all_minilm_mem" + CUSTOM_MODEL="ft-final-v20230717230459-all-MiniLM-L6-v2" + CUSTOM_MODEL_PATH="gs://datcom-nl-models/ft_final_v20230717230459.all-MiniLM-L6-v2" + CUSTOM_CATALOG_DICT=$(cat <> $LOG 2>&1 + set +x + status=$? + local duration=$(( $(date +%s) - $start_ts)) + [[ "$status" == "0" ]] || echo_fatal "Failed to build embeddings" + echo_log "Completed building embeddings with status:$status in $duration secs" cd "$cwd" } diff --git a/tools/nl/embeddings/requirements.txt b/tools/nl/embeddings/requirements.txt index 0f1447f0a7..d0c0854bf6 100644 --- a/tools/nl/embeddings/requirements.txt +++ b/tools/nl/embeddings/requirements.txt @@ -6,4 +6,9 @@ google-cloud-storage==2.15.0 lancedb==0.6.8 parameterized==0.8.1 sentence-transformers==2.2.2 -torchvision==0.17.2 \ No newline at end of file +torchvision==0.17.2 +# Downloading the named-entity recognition (NER) library spacy and the large EN model +# using the guidelines here: https://spacy.io/usage/models#production +# TODO: try using the large model +-f https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl +en_core_web_sm==3.7.1 \ No newline at end of file From 837bcbded7bf05007a203145039682fb896c4464 Mon Sep 17 00:00:00 2001 From: Keyur Shah Date: Wed, 26 Jun 2024 00:06:26 -0700 Subject: [PATCH 2/3] Remove nl_server requirements.txt. --- custom_dc/load_data.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/custom_dc/load_data.sh b/custom_dc/load_data.sh index 58a5db1436..fd043de00e 100755 --- a/custom_dc/load_data.sh +++ b/custom_dc/load_data.sh @@ -81,10 +81,6 @@ function setup_python { "https://download.pytorch.org/whl/cpu" echo_log "Installing Python requirements from $embeddings_req" run_cmd pip3 install -r "$embeddings_req" - # TODO: remove install once embeddings doesn't need nl_server/requirements.txt - # nlserver_req="$WEBSITE_DIR/nl_server/requirements.txt" - # echo_log "Installing Python requirements from $nlserver_req" - # run_cmd pip3 install -r "$nlserver_req" fi fi } From bb3bcd67467afde780d4982c759aa3644dd17476 Mon Sep 17 00:00:00 2001 From: Keyur Shah Date: Wed, 26 Jun 2024 03:28:00 -0700 Subject: [PATCH 3/3] Fix set status. --- custom_dc/load_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_dc/load_data.sh b/custom_dc/load_data.sh index fd043de00e..134944f7f3 100755 --- a/custom_dc/load_data.sh +++ b/custom_dc/load_data.sh @@ -253,8 +253,8 @@ EOF --embeddings_name "$CUSTOM_EMBEDDING_INDEX" \ --output_dir "$NL_EMBEDDINGS_DIR" \ --catalog "$CUSTOM_CATALOG_DICT" >> $LOG 2>&1 - set +x status=$? + set +x local duration=$(( $(date +%s) - $start_ts)) [[ "$status" == "0" ]] || echo_fatal "Failed to build embeddings" echo_log "Completed building embeddings with status:$status in $duration secs"