diff --git a/04-custom-train-retrieval.ipynb b/04-custom-train-retrieval.ipynb index 09e3d8a..e8aea1b 100644 --- a/04-custom-train-retrieval.ipynb +++ b/04-custom-train-retrieval.ipynb @@ -476,7 +476,7 @@ "# =================================================\n", "# trainconfig: Data sources\n", "# =================================================\n", - "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/train' # train\n", + "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/valid' # train\n", "VALID_DIR_PREFIX = f'data/{DATA_VERSION}/valid' \n", "CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates' " ] @@ -768,7 +768,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 21, "id": "33a58184-e0c4-4403-ad80-da5fdb7dbaed", "metadata": {}, "outputs": [ @@ -776,7 +776,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "TB_LOGS_PATH: gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-145451/logs\n" + "TB_LOGS_PATH: gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-203116/logs\n" ] } ], @@ -785,15 +785,14 @@ "\n", "import tensorflow as tf\n", "\n", - "# TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs'\n", - "TB_LOGS_PATH = \"gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-145451/logs\"\n", + "TB_LOGS_PATH = f'{BASE_OUTPUT_DIR}/logs'\n", "\n", "print(f\"TB_LOGS_PATH: {TB_LOGS_PATH}\")" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 24, "id": "d432d552-4416-4f4c-964f-bcca57b60a07", "metadata": {}, "outputs": [], @@ -804,31 +803,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "id": "171227db-d5f9-47e4-8b67-a528950233ab", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "\n", - " \n", - " \n", - " " - ], "text/plain": [ - "" + "ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 23612." ] }, "metadata": {}, diff --git a/07-train-pipeline.ipynb b/07-train-pipeline.ipynb index 268365f..703048c 100644 --- a/07-train-pipeline.ipynb +++ b/07-train-pipeline.ipynb @@ -247,7 +247,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "MODEL_ROOT_NAME: sp-2tower-tfrs-v1-pipev1\n" + "MODEL_ROOT_NAME: sp-2tower-tfrs-v1-pipe_v2\n" ] } ], @@ -2212,15 +2212,14 @@ "output_type": "stream", "text": [ "EXPERIMENT_NAME: tfrs-pipe-v1\n", - "RUN_NAME: run-20230922-202528\n" + "RUN_NAME: run-20230925-203808\n" ] } ], "source": [ "EXPERIMENT_PREFIX = 'tfrs-pipe' # custom identifier for organizing experiments\n", "EXPERIMENT_NAME=f'{EXPERIMENT_PREFIX}-{VERSION}'\n", - "# RUN_NAME = f'run-{time.strftime(\"%Y%m%d-%H%M%S\")}'\n", - "RUN_NAME = f'run-20230922-202528'\n", + "RUN_NAME = f'run-{time.strftime(\"%Y%m%d-%H%M%S\")}'\n", "\n", "print(f\"EXPERIMENT_NAME: {EXPERIMENT_NAME}\")\n", "print(f\"RUN_NAME: {RUN_NAME}\")" @@ -2248,7 +2247,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "PIPELINE_ROOT_PATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n" + "PIPELINE_ROOT_PATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n" ] } ], @@ -2282,7 +2281,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "FEATURES_PREFIX: tfrs-pipe-v1/run-20230922-202528/features\n" + "FEATURES_PREFIX: tfrs-pipe-v1/run-20230925-203808/features\n" ] } ], @@ -2512,7 +2511,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "VALID_FREQUENCY : 35\n", + "VALID_FREQUENCY : 23\n", "VALID_STEPS : 20\n", "EPOCH_STEPS : 2003\n", "EMBED_FREQUENCY : 1\n", @@ -2602,7 +2601,7 @@ "output_type": "stream", "text": [ "CANDIDATE_PREFIX: data/v1/candidates\n", - "TRAIN_DIR_PREFIX: data/v1/valid\n", + "TRAIN_DIR_PREFIX: data/v1/train\n", "VALID_DIR_PREFIX: data/v1/valid\n" ] } @@ -2611,7 +2610,7 @@ "# =================================================\n", "# trainconfig: Data sources\n", "# =================================================\n", - "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/valid' # train\n", + "TRAIN_DIR_PREFIX = f'data/{DATA_VERSION}/train' # train\n", "VALID_DIR_PREFIX = f'data/{DATA_VERSION}/valid' \n", "CANDIDATE_PREFIX = f'data/{DATA_VERSION}/candidates' \n", "\n", @@ -2642,40 +2641,40 @@ "[{'container_spec': {'args': ['--project=hybrid-vertex',\n", " '--train_output_gcs_bucket=ndr-v1-hybrid-vertex-bucket',\n", " '--train_dir=ndr-v1-hybrid-vertex-bucket',\n", - " '--train_dir_prefix=data/v1/valid',\n", + " '--train_dir_prefix=data/v1/train',\n", " '--valid_dir=ndr-v1-hybrid-vertex-bucket',\n", " '--valid_dir_prefix=data/v1/valid',\n", " '--candidate_file_dir=ndr-v1-hybrid-vertex-bucket',\n", " '--candidate_files_prefix=data/v1/candidates',\n", " '--experiment_name=tfrs-pipe-v1',\n", - " '--experiment_run=run-20230922-202528',\n", + " '--experiment_run=run-20230925-203808',\n", " '--num_epochs=70',\n", " '--batch_size=4096',\n", " '--embedding_dim=128',\n", " '--projection_dim=32',\n", " '--layer_sizes=[512,256,128]',\n", " '--learning_rate=0.01',\n", - " '--valid_frequency=35',\n", + " '--valid_frequency=23',\n", " '--valid_steps=20',\n", " '--epoch_steps=2003',\n", " '--distribute=single',\n", " '--model_version=v1',\n", - " '--pipeline_version=pipev1',\n", + " '--pipeline_version=pipe_v2',\n", " '--seed=1234',\n", " '--max_tokens=20000',\n", " '--embed_frequency=1',\n", + " '--update_frequency=500',\n", " '--hist_frequency=0',\n", " '--tf_gpu_thread_count=8',\n", " '--block_length=64',\n", " '--num_data_shards=4',\n", " '--chkpt_freq=500',\n", " '--dropout_rate=0.33',\n", - " '--cache_train',\n", " '--compute_batch_metrics',\n", " '--use_cross_layer',\n", " '--use_dropout'],\n", " 'command': ['python', '-m', 'src.two_tower_jt.task'],\n", - " 'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/ndr-v1-spotify/train-v1'},\n", + " 'image_uri': 'us-central1-docker.pkg.dev/hybrid-vertex/ndr-v1-spotify/train-v1:latest'},\n", " 'machine_spec': {'accelerator_count': 1,\n", " 'accelerator_type': 'NVIDIA_TESLA_T4',\n", " 'machine_type': 'n1-highmem-16'},\n", @@ -2724,7 +2723,7 @@ " f'--chkpt_freq={CHECKPOINT_FREQ}',\n", " f'--dropout_rate={DROPOUT_RATE}',\n", " # uncomment these to pass value of True (bool)\n", - " f'--cache_train', # caches train_dataset\n", + " # f'--cache_train', # caches train_dataset\n", " # f'--evaluate_model', # runs model.eval()\n", " # f'--write_embeddings', # writes embeddings index in train job\n", " # f'--profiler', # runs TB profiler\n", @@ -2735,7 +2734,7 @@ "]\n", "\n", "WORKER_POOL_SPECS = workerpool_specs.prepare_worker_pool_specs(\n", - " image_uri=REMOTE_IMAGE_NAME,\n", + " image_uri=f\"{REMOTE_IMAGE_NAME}:latest\",\n", " args=WORKER_ARGS,\n", " cmd=WORKER_CMD,\n", " replica_count=REPLICA_COUNT,\n", @@ -2761,7 +2760,7 @@ "output_type": "stream", "text": [ "/home/jupyter/jw-repo2/spotify_mpd_two_tower\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n", "src\n" ] } @@ -2793,7 +2792,7 @@ { "data": { "text/plain": [ - "'gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528'" + "'gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808'" ] }, "execution_count": 40, @@ -2819,7 +2818,7 @@ "output_type": "stream", "text": [ "\n", - " Copied training package and Dockerfile to gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root\n", + " Copied training package and Dockerfile to gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root\n", "\n" ] } @@ -2849,16 +2848,16 @@ "name": "stdout", "output_type": "stream", "text": [ - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/__init__.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/feature_sets.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/interactive_train.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/requirements.txt\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/task.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/test_instances.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/train_config.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/train_utils.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/two_tower.py\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/__pycache__/\n" + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/__init__.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/feature_sets.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/interactive_train.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/requirements.txt\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/task.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/test_instances.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/train_config.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/train_utils.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/two_tower.py\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/__pycache__/\n" ] } ], @@ -2884,8 +2883,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "PIPELINE_TAG: 2tower-pipev1\n", - "PIPELINE_NAME: tfrs-v1-2tower-pipev1\n" + "PIPELINE_TAG: 2tower-pipe_v2\n", + "PIPELINE_NAME: tfrs-v1-2tower-pipe-v2\n" ] } ], @@ -3326,7 +3325,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "PIPELINES_FILEPATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/pipeline_spec.json\n" + "PIPELINES_FILEPATH: gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/pipeline_spec.json\n" ] } ], @@ -3349,10 +3348,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/Dockerfile_tfrs\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/pipeline_spec.json\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/934903580331/\n", - "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230922-202528/pipeline_root/trainer/\n" + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/Dockerfile_tfrs\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/pipeline_spec.json\n", + "gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/pipeline_root/trainer/\n" ] } ],