From be04b2173466966764b6c8768199f4fa1a23e886 Mon Sep 17 00:00:00 2001 From: jordan Date: Mon, 25 Sep 2023 21:49:46 +0000 Subject: [PATCH] update query --- 01-bq-data-prep.ipynb | 2 +- 04-custom-train-retrieval.ipynb | 39 +- util/tb_profiling.ipynb | 1390 ++++++++++++++++++++++++++++++- 3 files changed, 1376 insertions(+), 55 deletions(-) diff --git a/01-bq-data-prep.ipynb b/01-bq-data-prep.ipynb index c114e86..18b20b5 100644 --- a/01-bq-data-prep.ipynb +++ b/01-bq-data-prep.ipynb @@ -901,7 +901,7 @@ " IFNULL(audio.tempo, 0.0) AS track_tempo_can,\n", " IFNULL(audio.time_signature, 0) AS track_time_signature_can,\n", " ARRAY(\n", - " SELECT CAST(pos_can - pos_pl \n", + " SELECT CAST(pos_can - pos_pl) AS FLOAT64\n", " FROM\n", " UNNEST(seed_playlist_tracks) t\n", " WHERE \n", diff --git a/04-custom-train-retrieval.ipynb b/04-custom-train-retrieval.ipynb index e8aea1b..5abf1ab 100644 --- a/04-custom-train-retrieval.ipynb +++ b/04-custom-train-retrieval.ipynb @@ -491,7 +491,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 26, "id": "651cc211-4660-406d-a586-5e60ebfeb805", "metadata": {}, "outputs": [], @@ -548,7 +548,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 27, "id": "354feb4e-43c7-4238-9d4c-8a37b6278a62", "metadata": {}, "outputs": [ @@ -565,19 +565,6 @@ " '--candidate_file_dir=ndr-v1-hybrid-vertex-bucket',\n", " '--candidate_files_prefix=data/v1/candidates',\n", " '--experiment_name=scale-training-v1',\n", - " '--experiment_run=run-20230925-203116',\n", - " '--num_epochs=15',\n", - " '--batch_size=4096',\n", - " '--embedding_dim=128',\n", - " '--projection_dim=32',\n", - " '--layer_sizes=[512,256,128]',\n", - " '--learning_rate=0.01',\n", - " '--valid_frequency=7',\n", - " '--valid_steps=20',\n", - " '--epoch_steps=2003',\n", - " '--distribute=single',\n", - " '--model_version=v1',\n", - " '--pipeline_version=v1',\n", " '--seed=1234',\n", " '--max_tokens=20000',\n", " '--tb_resource_name=projects/934903580331/locations/us-central1/tensorboards/1356559854163984384',\n", @@ -768,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "id": "33a58184-e0c4-4403-ad80-da5fdb7dbaed", "metadata": {}, "outputs": [ @@ -792,33 +779,23 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 32, "id": "d432d552-4416-4f4c-964f-bcca57b60a07", "metadata": {}, "outputs": [], "source": [ "# %load_ext tensorboard\n", - "%reload_ext tensorboard" + "# %reload_ext tensorboard" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 31, "id": "171227db-d5f9-47e4-8b67-a528950233ab", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 23612." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ - "%tensorboard --logdir=$TB_LOGS_PATH" + "# %tensorboard --logdir=$TB_LOGS_PATH" ] }, { diff --git a/util/tb_profiling.ipynb b/util/tb_profiling.ipynb index e7604a2..5033c44 100644 --- a/util/tb_profiling.ipynb +++ b/util/tb_profiling.ipynb @@ -11,33 +11,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "0e1b7758-4ed1-48ec-b0ed-992dc6843df5", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-04-17 13:58:05.428116: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F AVX512_VNNI FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-04-17 13:58:05.562482: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2023-04-17 13:58:06.289886: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64\n", - "2023-04-17 13:58:06.289998: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64\n", - "2023-04-17 13:58:06.290008: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.\n" - ] - } - ], + "outputs": [], "source": [ "# !pip install -U tensorboard_plugin_profile --user\n", "\n", + "import os\n", + "\n", + "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' \n", + "\n", "import tensorflow as tf\n", "import datetime, os" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "id": "fb5d34ec-0df5-4d9c-b8e2-a8a4038292fc", "metadata": {}, "outputs": [], @@ -65,14 +56,20 @@ "\n", "# path = 'gs://jt-tfrs-central/test-repo-v1-jtv12/run-20230109-205419/logs'\n", "\n", - "path = 'gs://jt-tfrs-central-v3/tfrs-pipe-e2e-v2/run-20230411-131340/logs' # \n", + "# path = 'gs://jt-tfrs-central-v3/tfrs-pipe-e2e-v2/run-20230411-131340/logs' # \n", + "\n", + "# path = 'gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230919-145442/logs'\n", + "\n", + "# K demo\n", + "path = 'gs://ndr-v1-hybrid-vertex-bucket/scale-training-v1/run-20230925-203116/logs'\n", + "# path = 'gs://ndr-v1-hybrid-vertex-bucket/tfrs-pipe-v1/run-20230925-203808/logs'\n", "\n", "\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "id": "cdd31a5c-748a-4a67-83c9-4ba10596e34d", "metadata": {}, "outputs": [], @@ -83,14 +80,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "id": "c0609eb8-c25b-470a-b246-f226e1760ce8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Reusing TensorBoard on port 6006 (pid 21068), started 0:14:48 ago. (Use '!kill 21068' to kill it.)" + "Reusing TensorBoard on port 6009 (pid 1432), started 0:15:23 ago. (Use '!kill 1432' to kill it.)" ] }, "metadata": {}, @@ -100,12 +97,12 @@ "data": { "text/html": [ "\n", - " \n", "