diff --git a/04-custom-train-retrieval.ipynb b/04-custom-train-retrieval.ipynb
index 09e3d8a..e8aea1b 100644
--- a/04-custom-train-retrieval.ipynb
+++ b/04-custom-train-retrieval.ipynb
@@ -476,7 +476,7 @@
"# =================================================\n",
"# trainconfig: Data sources\n",
"# =================================================\n",
- "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/train' # train\n",
+ "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/train' # train split; must differ from VALID_DIR_PREFIX\n",
"VALID_DIR_PREFIX = f'data/{DATA_VERSION}/valid' \n",
"CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates' "
]
@@ -768,7 +768,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 21,
"id": "33a58184-e0c4-4403-ad80-da5fdb7dbaed",
"metadata": {},
"outputs": [
@@ -776,7 +776,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "TB_LOGS_PATH: gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-145451/logs\n"
+ "TB_LOGS_PATH: gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-203116/logs\n"
]
}
],
@@ -785,15 +785,14 @@
"\n",
"import tensorflow as tf\n",
"\n",
- "# TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs'\n",
- "TB_LOGS_PATH = \"gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-145451/logs\"\n",
+ "TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs'\n",
"\n",
"print(f\"TB_LOGS_PATH: {TB_LOGS_PATH}\")"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 24,
"id": "d432d552-4416-4f4c-964f-bcca57b60a07",
"metadata": {},
"outputs": [],
@@ -804,31 +803,14 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 25,
"id": "171227db-d5f9-47e4-8b67-a528950233ab",
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "\n",
- " \n",
- " \n",
- " "
- ],
"text/plain": [
- ""
+ "ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 23612."
]
},
"metadata": {},
diff --git a/07-train-pipeline.ipynb b/07-train-pipeline.ipynb
index 268365f..703048c 100644
--- a/07-train-pipeline.ipynb
+++ b/07-train-pipeline.ipynb
@@ -247,7 +247,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "MODEL_ROOT_NAME: sp-2tower-tfrs-v1-pipev1\n"
+ "MODEL_ROOT_NAME: sp-2tower-tfrs-v1-pipe_v2\n"
]
}
],
@@ -2212,15 +2212,14 @@
"output_type": "stream",
"text": [
"EXPERIMENT_NAME: tfrs-pipe-v1\n",
- "RUN_NAME: run-20230922-202528\n"
+ "RUN_NAME: run-20230925-203808\n"
]
}
],
"source": [
"EXPERIMENT_PREFIX = 'tfrs-pipe' # custom identifier for organizing experiments\n",
"EXPERIMENT_NAME=f'{EXPERIMENT_PREFIX}-{VERSION}'\n",
- "# RUN_NAME = f'run-{time.strftime(\"%Y%m%d-%H%M%S\")}'\n",
- "RUN_NAME = f'run-20230922-202528'\n",
+ "RUN_NAME = f'run-{time.strftime(\"%Y%m%d-%H%M%S\")}'\n",
"\n",
"print(f\"EXPERIMENT_NAME: {EXPERIMENT_NAME}\")\n",
"print(f\"RUN_NAME: {RUN_NAME}\")"
@@ -2248,7 +2247,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "PIPELINE_ROOT_PATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n"
+ "PIPELINE_ROOT_PATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n"
]
}
],
@@ -2282,7 +2281,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "FEATURES_PREFIX: tfrs-pipe-v1/run-20230922-202528/features\n"
+ "FEATURES_PREFIX: tfrs-pipe-v1/run-20230925-203808/features\n"
]
}
],
@@ -2512,7 +2511,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "VALID_FREQUENCY : 35\n",
+ "VALID_FREQUENCY : 23\n",
"VALID_STEPS : 20\n",
"EPOCH_STEPS : 2003\n",
"EMBED_FREQUENCY : 1\n",
@@ -2602,7 +2601,7 @@
"output_type": "stream",
"text": [
"CANDIDATE_PREFIX: data/v1/candidates\n",
- "TRAIN_DIR_PREFIX: data/v1/valid\n",
+ "TRAIN_DIR_PREFIX: data/v1/train\n",
"VALID_DIR_PREFIX: data/v1/valid\n"
]
}
@@ -2611,7 +2610,7 @@
"# =================================================\n",
"# trainconfig: Data sources\n",
"# =================================================\n",
- "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/valid' # train\n",
+ "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/train' # train\n",
"VALID_DIR_PREFIX = f'data/{DATA_VERSION}/valid' \n",
"CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates' \n",
"\n",
@@ -2642,40 +2641,40 @@
"[{'container_spec': {'args': ['--project=hybrid-vertex',\n",
" '--train_output_gcs_bucket=ndr-v1-hybrid-vertex-bucket',\n",
" '--train_dir=ndr-v1-hybrid-vertex-bucket',\n",
- " '--train_dir_prefix=data/v1/valid',\n",
+ " '--train_dir_prefix=data/v1/train',\n",
" '--valid_dir=ndr-v1-hybrid-vertex-bucket',\n",
" '--valid_dir_prefix=data/v1/valid',\n",
" '--candidate_file_dir=ndr-v1-hybrid-vertex-bucket',\n",
" '--candidate_files_prefix=data/v1/candidates',\n",
" '--experiment_name=tfrs-pipe-v1',\n",
- " '--experiment_run=run-20230922-202528',\n",
+ " '--experiment_run=run-20230925-203808',\n",
" '--num_epochs=70',\n",
" '--batch_size=4096',\n",
" '--embedding_dim=128',\n",
" '--projection_dim=32',\n",
" '--layer_sizes=[512,256,128]',\n",
" '--learning_rate=0.01',\n",
- " '--valid_frequency=35',\n",
+ " '--valid_frequency=23',\n",
" '--valid_steps=20',\n",
" '--epoch_steps=2003',\n",
" '--distribute=single',\n",
" '--model_version=v1',\n",
- " '--pipeline_version=pipev1',\n",
+ " '--pipeline_version=pipe_v2',\n",
" '--seed=1234',\n",
" '--max_tokens=20000',\n",
" '--embed_frequency=1',\n",
+ " '--update_frequency=500',\n",
" '--hist_frequency=0',\n",
" '--tf_gpu_thread_count=8',\n",
" '--block_length=64',\n",
" '--num_data_shards=4',\n",
" '--chkpt_freq=500',\n",
" '--dropout_rate=0.33',\n",
- " '--cache_train',\n",
" '--compute_batch_metrics',\n",
" '--use_cross_layer',\n",
" '--use_dropout'],\n",
" 'command': ['python', '-m', 'src.two_tower_jt.task'],\n",
- " 'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/ndr-v1-spotify/train-v1'},\n",
+ " 'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/ndr-v1-spotify/train-v1:latest'},\n",
" 'machine_spec': {'accelerator_count': 1,\n",
" 'accelerator_type': 'NVIDIA_TESLA_T4',\n",
" 'machine_type': 'n1-highmem-16'},\n",
@@ -2724,7 +2723,7 @@
" f'--chkpt_freq={CHECKPOINT_FREQ}',\n",
" f'--dropout_rate={DROPOUT_RATE}',\n",
" # uncomment these to pass value of True (bool)\n",
- " f'--cache_train', # caches train_dataset\n",
+ " # f'--cache_train', # caches train_dataset\n",
" # f'--evaluate_model', # runs model.eval()\n",
" # f'--write_embeddings', # writes embeddings index in train job\n",
" # f'--profiler', # runs TB profiler\n",
@@ -2735,7 +2734,7 @@
"]\n",
"\n",
"WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(\n",
- " image_uri=REMOTE_IMAGE_NAME,\n",
+ " image_uri=f\"{REMOTE_IMAGE_NAME}:latest\",\n",
" args=WORKER_ARGS,\n",
" cmd=WORKER_CMD,\n",
" replica_count=REPLICA_COUNT,\n",
@@ -2761,7 +2760,7 @@
"output_type": "stream",
"text": [
"/home/jupyter/jw-repo2/spotify_mpd_two_tower\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n",
"src\n"
]
}
@@ -2793,7 +2792,7 @@
{
"data": {
"text/plain": [
- "'gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528'"
+ "'gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808'"
]
},
"execution_count": 40,
@@ -2819,7 +2818,7 @@
"output_type": "stream",
"text": [
"\n",
- " Copied training package and Dockerfile to gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n",
+ " Copied training package and Dockerfile to gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n",
"\n"
]
}
@@ -2849,16 +2848,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/__init__.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/feature_sets.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/interactive_train.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/requirements.txt\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/task.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/test_instances.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/train_config.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/train_utils.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/two_tower.py\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/__pycache__/\n"
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/__init__.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/feature_sets.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/interactive_train.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/requirements.txt\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/task.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/test_instances.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/train_config.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/train_utils.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/two_tower.py\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/__pycache__/\n"
]
}
],
@@ -2884,8 +2883,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "PIPELINE_TAG: 2tower-pipev1\n",
- "PIPELINE_NAME: tfrs-v1-2tower-pipev1\n"
+ "PIPELINE_TAG: 2tower-pipe_v2\n",
+ "PIPELINE_NAME: tfrs-v1-2tower-pipe-v2\n"
]
}
],
@@ -3326,7 +3325,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "PIPELINES_FILEPATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/pipeline_spec.json\n"
+ "PIPELINES_FILEPATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/pipeline_spec.json\n"
]
}
],
@@ -3349,10 +3348,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/Dockerfile_tfrs\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/pipeline_spec.json\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/934903580331/\n",
- "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/\n"
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/Dockerfile_tfrs\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/pipeline_spec.json\n",
+ "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/\n"
]
}
],