From b2d38be264136655365e0796001d4b13023098b7 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Tue, 13 Jun 2023 15:11:13 -0500 Subject: [PATCH 01/10] Add new mkdocs feature to suppress INFO output from nav INFO - The following pages exist in the docs directory, but are not included in the "nav" configuration: --- mkdocs.yml | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index a847aafdb..c3ee6c984 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -279,6 +279,60 @@ nav: - Data and Software Policies: - Data Policy: policies/data-and-software-policies/data-policy.md - Software Policy: policies/data-and-software-policies/software-policy.md +not_in_nav: | + account-project-management/allocation-management/sbank-detail-allocations.md + account-project-management/allocation-management/sbank-detail-jobs.md + account-project-management/allocation-management/sbank-detail-projects.md + account-project-management/allocation-management/sbank-detail-transactions.md + account-project-management/allocation-management/sbank-detail-users.md + account-project-management/allocation-management/sbank-detail.md + account-project-management/allocation-management/sbank-examples.md + account-project-management/allocation-management/sbank-list-allocations.md + account-project-management/allocation-management/sbank-list-jobs.md + account-project-management/allocation-management/sbank-list-projects.md + account-project-management/allocation-management/sbank-list-transactions.md + account-project-management/allocation-management/sbank-list-users.md + account-project-management/allocation-management/sbank-list.md + account-project-management/allocation-management/sbank-manpage.md + ai-testbed/howto-contribute.md + ai-testbed/cerebras/README.md + ai-testbed/cerebras/performance-tools.md + ai-testbed/files/notes.md + ai-testbed/files/todo.md + ai-testbed/graphcore/README.md + ai-testbed/graphcore/Scaling-ResNet50.md + ai-testbed/graphcore/cosmictagger-conversion.md + ai-testbed/graphcore/cosmictagger-ddp.md + ai-testbed/graphcore/multi-node-setup.md + ai-testbed/graphcore/profiling-mnist.md + ai-testbed/graphcore/profiling-resnet50.md + ai-testbed/graphcore/profiling.md + ai-testbed/habana/getting-started.md + ai-testbed/sambanova_gen1/TODO.md + ai-testbed/sambanova_gen1/performance-tools.md + ai-testbed/sambanova_gen1/readme-rick.md + ai-testbed/sambanova_gen1/readme.md + ai-testbed/sambanova_gen1/readme_rick_02.md + ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md + ai-testbed/sambanova_gen1/sambanova.md + ai-testbed/sambanova_gen1/sambatune-user-guide.md + ai-testbed/sambanova_gen2/README.md + ai-testbed/sambanova_gen2/TODO.md + ai-testbed/sambanova_gen2/cosmictagger-conversion.md + ai-testbed/sambanova_gen2/performance-tools.md + ai-testbed/sambanova_gen2/running-GPT2.md + ai-testbed/sambanova_gen2/sambanova.md + ai-testbed/sambanova_gen2/sambatune-user-guide.md + ai-testbed/sambanova_gen2/not_published/running-GPT2-multi-node.md + ai-testbed/sambanova_gen2/not_published/running-bert-large-on-sn30.md + ai-testbed/sambanova_gen2/not_published/sambatune-user-guide.md + polaris/compiling-and-linking/continuous-integration-polaris.md + polaris/debugging-tools/debugging-overview.md + polaris/performance-tools/performance-overview.md + running-jobs/gronkulator.md + running-jobs/pbs-admin-quick-start-guide.md + services/index.md + theta/debugging-tools/debugging-overview.md theme: name: 'material' 
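The `not_in_nav` setting used above landed in MkDocs 1.5.0, which is why the `requirements.txt` change below pins `mkdocs>=1.5.0`. Its value is parsed as gitignore-style patterns, not only literal paths, so the long explicit list could likely be condensed with globs. A minimal sketch; the patterns are illustrative assumptions, not part of this patch:

```yaml
# Hypothetical condensed form of the not_in_nav block above.
# Each line is a gitignore-style pattern matched against page paths under docs/.
not_in_nav: |
  account-project-management/allocation-management/sbank-detail*.md
  account-project-management/allocation-management/sbank-list*.md
  ai-testbed/sambanova_gen2/not_published/
```

Directory patterns like the last line cover every page beneath them, which pairs naturally with the later patches in this series that move unused pages into `not_in_nav/` and `unused/` subdirectories.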
diff --git a/requirements.txt b/requirements.txt index f3c919456..2d21f0fc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ mkdocs-material -mkdocs +mkdocs>=1.5.0 mkdocs-video mkdocs-include-markdown-plugin mkdocs-codeinclude-plugin>=0.2.1 From 1cf46589c425c3d1fa4e5fc0eaa0a6c033b6a063 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 27 Jul 2023 17:57:51 -0500 Subject: [PATCH 02/10] Start moving unused docs to special dirs --- .../sambanova_gen1/{ => unused}/running-bert-large-on-sn10-8r.md | 1 + 1 file changed, 1 insertion(+) rename docs/ai-testbed/sambanova_gen1/{ => unused}/running-bert-large-on-sn10-8r.md (99%) diff --git a/docs/ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md index a03fdee70..c35a94a23 100644 --- a/docs/ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md +++ b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md @@ -1,5 +1,6 @@ # Steps to Run BERT-Large on Sambanova DataScale SN10-8R + * BERT Code is in the [Bert](./bert/) directory here for your reference. * [transformners_hook.py](./bert/transformers_hook.py): contains code for BERT. From 919abd5c8daed0e2cea860573f45ff4c8c30ba3e Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 27 Jul 2023 18:20:29 -0500 Subject: [PATCH 03/10] Clean up habana, sn1, etc. --- .../allocation-management.md | 2 +- .../sbank-detail-allocations.md | 0 .../{ => not_in_nav}/sbank-detail-jobs.md | 0 .../{ => not_in_nav}/sbank-detail-projects.md | 0 .../sbank-detail-transactions.md | 0 .../{ => not_in_nav}/sbank-detail-users.md | 0 .../{ => not_in_nav}/sbank-detail.md | 0 .../{ => not_in_nav}/sbank-examples.md | 0 .../sbank-list-allocations.md | 0 .../{ => not_in_nav}/sbank-list-jobs.md | 0 .../{ => not_in_nav}/sbank-list-projects.md | 0 .../sbank-list-transactions.md | 0 .../{ => not_in_nav}/sbank-list-users.md | 0 .../{ => not_in_nav}/sbank-list.md | 0 .../{ => not_in_nav}/sbank-manpage.md | 0 .../sbank-allocation-accounting-system.md | 28 +- docs/ai-testbed/habana/getting-started.md | 67 ----- .../sambanova_gen1/example-programs.md | 2 +- .../{ => files}/2022-09-21T19-21-05.html | 0 .../{ => files}/bw_unet_compile_run_all.sh | 0 .../{ => files}/ccle_09_19_22_11_50.log | 0 .../{ => files}/tmpeo5ehksn.html | 0 .../sambanova_gen1/{ => files}/unet_all.sh | 0 .../sambanova_gen1/{ => files}/unet_batch.sh | 0 .../{ => files}/unet_compile_run_all.sh | 0 .../{ => files}/unet_compile_run_inf_rl.sh | 0 .../unet_compile_run_parallel_all.sh | 0 .../sambanova_gen1/{ => files}/uno.yaml | 0 .../{ => files}/uno_bruce_tmp.yaml | 0 .../{ => files}/uno_brw_CCLE.yaml | 0 .../{ => files}/uno_rick_tmp.yaml | 0 docs/ai-testbed/sambanova_gen1/readme-rick.md | 253 ------------------ docs/ai-testbed/sambanova_gen1/readme.md | 15 -- .../sambanova_gen1/readme_rick_02.md | 22 -- .../{ => unused}/performance-tools.md | 0 .../unused/running-bert-large-on-sn10-8r.md | 3 +- .../sambanova_gen1/{ => unused}/sambanova.md | 0 .../{ => unused}/sambatune-user-guide.md | 0 .../continuous-integration-polaris.md | 4 - .../debugging-tools/debugging-overview.md | 2 - .../performance-tools/performance-overview.md | 3 - mkdocs.yml | 34 +-- 42 files changed, 23 insertions(+), 412 deletions(-) rename docs/account-project-management/allocation-management/{ =>
not_in_nav}/sbank-detail-allocations.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-detail-jobs.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-detail-projects.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-detail-transactions.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-detail-users.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-detail.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-examples.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list-allocations.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list-jobs.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list-projects.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list-transactions.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list-users.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-list.md (100%) rename docs/account-project-management/allocation-management/{ => not_in_nav}/sbank-manpage.md (100%) delete mode 100644 docs/ai-testbed/habana/getting-started.md rename docs/ai-testbed/sambanova_gen1/{ => files}/2022-09-21T19-21-05.html (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/bw_unet_compile_run_all.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/ccle_09_19_22_11_50.log (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/tmpeo5ehksn.html (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/unet_all.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/unet_batch.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/unet_compile_run_all.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/unet_compile_run_inf_rl.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/unet_compile_run_parallel_all.sh (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/uno.yaml (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/uno_bruce_tmp.yaml (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/uno_brw_CCLE.yaml (100%) rename docs/ai-testbed/sambanova_gen1/{ => files}/uno_rick_tmp.yaml (100%) delete mode 100644 docs/ai-testbed/sambanova_gen1/readme-rick.md delete mode 100644 docs/ai-testbed/sambanova_gen1/readme.md delete mode 100644 docs/ai-testbed/sambanova_gen1/readme_rick_02.md rename docs/ai-testbed/sambanova_gen1/{ => unused}/performance-tools.md (100%) rename docs/ai-testbed/sambanova_gen1/{ => unused}/sambanova.md (100%) rename docs/ai-testbed/sambanova_gen1/{ => unused}/sambatune-user-guide.md (100%) delete mode 100644 docs/polaris/compiling-and-linking/continuous-integration-polaris.md delete mode 100644 docs/polaris/debugging-tools/debugging-overview.md delete mode 100644 docs/polaris/performance-tools/performance-overview.md diff --git a/docs/account-project-management/allocation-management/allocation-management.md b/docs/account-project-management/allocation-management/allocation-management.md index d41d69d61..d7450ef86 100644 --- a/docs/account-project-management/allocation-management/allocation-management.md +++ b/docs/account-project-management/allocation-management/allocation-management.md @@ -9,7 +9,7 @@ For 
information on how to run the query, look at our documentation on our [sbank ## Using sbank to Determine the Balance of an Allocation To determine which platforms have an active balance, check our allocation accounting system [sbank](sbank-allocation-accounting-system.md). -- To obtain the allocation balance, check the sbank command [sbank-list-allocations](sbank-list-allocations.md). +- To obtain the allocation balance, check the sbank command [sbank-list-allocations](not_in_nav/sbank-list-allocations.md). - DD projects with a negative balance will not be able to run jobs until they have requested additional time, see Getting more time below. - INCITE and ALCC PIs automatically email a summary of project usage. If this is a DD project, please email [support@alcf.anl.gov](mailto:support@alcf.anl.gov). diff --git a/docs/account-project-management/allocation-management/sbank-detail-allocations.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-allocations.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-allocations.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-allocations.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-jobs.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-jobs.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-jobs.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-jobs.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-projects.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-projects.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-projects.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-projects.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-transactions.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-transactions.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-transactions.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-transactions.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-users.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-users.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-users.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-users.md diff --git a/docs/account-project-management/allocation-management/sbank-detail.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail.md diff --git a/docs/account-project-management/allocation-management/sbank-examples.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-examples.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-examples.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-examples.md diff --git 
a/docs/account-project-management/allocation-management/sbank-list-allocations.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-allocations.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-allocations.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-allocations.md diff --git a/docs/account-project-management/allocation-management/sbank-list-jobs.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-jobs.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-jobs.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-jobs.md diff --git a/docs/account-project-management/allocation-management/sbank-list-projects.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-projects.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-projects.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-projects.md diff --git a/docs/account-project-management/allocation-management/sbank-list-transactions.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-transactions.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-transactions.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-transactions.md diff --git a/docs/account-project-management/allocation-management/sbank-list-users.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-users.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-users.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-users.md diff --git a/docs/account-project-management/allocation-management/sbank-list.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list.md diff --git a/docs/account-project-management/allocation-management/sbank-manpage.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-manpage.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-manpage.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-manpage.md diff --git a/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md b/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md index fa2d1657c..b4d1a8475 100644 --- a/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md +++ b/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md @@ -5,22 +5,22 @@ The sbank accounting system helps users manage their allocations and usage per j ## Getting Started with sbank -[sbank Example Commands](sbank-examples.md) provides a set of example commands on how to use the most common commands. +[sbank Example Commands](not_in_nav/sbank-examples.md) provides a set of example commands on how to use the most common commands. ## sbank Man Pages Use these sbank man pages to get information on how to use the commands. 
-- [sbank](sbank-manpage.md) -- [sbank-detail](sbank-detail.md) -- [sbank-detail-allocations](sbank-detail-allocations.md) -- [sbank-detail-jobs](sbank-detail-jobs.md) -- [sbank-detail-projects](sbank-detail-projects.md) -- [sbank-detail-transactions](sbank-detail-transactions.md) -- [sbank-detail-users](sbank-detail-users.md) -- [sbank-list](sbank-list.md) -- [sbank-list-allocations](sbank-list-allocations.md) -- [sbank-list-jobs](sbank-list-jobs.md) -- [sbank-list-projects](sbank-list-projects.md) -- [sbank-list-transactions](sbank-list-transactions.md) -- [sbank-list-users](sbank-list-users.md) +- [sbank](not_in_nav/sbank-manpage.md) +- [sbank-detail](not_in_nav/sbank-detail.md) +- [sbank-detail-allocations](not_in_nav/sbank-detail-allocations.md) +- [sbank-detail-jobs](not_in_nav/sbank-detail-jobs.md) +- [sbank-detail-projects](not_in_nav/sbank-detail-projects.md) +- [sbank-detail-transactions](not_in_nav/sbank-detail-transactions.md) +- [sbank-detail-users](not_in_nav/sbank-detail-users.md) +- [sbank-list](not_in_nav/sbank-list.md) +- [sbank-list-allocations](not_in_nav/sbank-list-allocations.md) +- [sbank-list-jobs](not_in_nav/sbank-list-jobs.md) +- [sbank-list-projects](not_in_nav/sbank-list-projects.md) +- [sbank-list-transactions](not_in_nav/sbank-list-transactions.md) +- [sbank-list-users](not_in_nav/sbank-list-users.md) diff --git a/docs/ai-testbed/habana/getting-started.md b/docs/ai-testbed/habana/getting-started.md deleted file mode 100644 index e7973c99e..000000000 --- a/docs/ai-testbed/habana/getting-started.md +++ /dev/null @@ -1,67 +0,0 @@ -# Getting Started - -## On-Boarding - -See [Get Started](https://www.alcf.anl.gov/support-center/get-started) -to request an acccount and additional information. - -## Setup - -### System View - -Connection to a Sambanova node is a two step process. First step is to ssh to a "login node". -This step requires a MFA passcode for authentication - a 8 digit passcode generated by an app on your mobile device (e.g. mobilePASS+). -The second step is to login to a sambanova node from the login node. -In the examples below, replace ALCFUserID with your ALCF user id. -![SambaNova System View](Log_in.png "SambaNova System View") - -### Login to Login Node - -Login to the SambaNova login node from your local machine using the below command. This uses the MobilPass+ token generated everytime you login to the system. This is the same passcode used to authenticate into other ALCF systems, such as Theta and Cooley. - -```bash -ssh ALCFUserID@sambanova.alcf.anl.gov -ALCFUserID@sambanova.alcf.anl.govs password: < MobilPass+ code > -``` - -Note: Use the ssh "-v" option in order to debug any ssh problems. - -### Login to SambaNova Node - -Once you are on the login node, the sambanova system can be accessed using the alias “sm-01” that resolves to hostname sm-01.ai.alcf.anl.gov. - -```bash -ssh sm-01 -``` - -### SDK setup - -The SambaNova system has a bash shell script to setup the required software environment. -This sets up the SambaFlow software stack, the associated environmental variables and activates -a pre-configured virtual environment. - -Use - -```bash -ALCFUserID@sm-01:~$ source /software/sambanova/envs/sn_env.sh -(venv) ALCFUserID@sm-01:~$ -``` - -The contents of the sn_env.sh script is shown below for convenience. - -```bash -alias snpath='export PATH=$PATH:/opt/sambaflow/bin' # This is the path to SambaFlow which is the software stack that is running on SambaNova systems. 
This stack includes the Runtime, the compilers, and the SambaFlow Python SDK which is used to create and run models. - -alias snthreads='export OMP_NUM_THREADS=1' # The OMP_NUM_THREADS environment variable sets the number of threads to use for parallel regions. The value of this environment variable must be a list of positive integer values. The values of the list set the number of threads to use for parallel regions at the corresponding nested levels.For the SambaNova system it is usually set to 1. - -alias snvenv='source /opt/sambaflow/venv/bin/activate' # This activates the pre-configured virtual environment that consists of sambaflow and other built-in libraries. -``` - -**NOTE: SambaNova operations will fail unless the SambaNova venv is set -up.** - -You may deactivate the environment if finished. - -```bash -deactivate -``` diff --git a/docs/ai-testbed/sambanova_gen1/example-programs.md b/docs/ai-testbed/sambanova_gen1/example-programs.md index e2cb50fd8..0a7434190 100644 --- a/docs/ai-testbed/sambanova_gen1/example-programs.md +++ b/docs/ai-testbed/sambanova_gen1/example-programs.md @@ -273,7 +273,7 @@ cp -r /opt/sambaflow/apps/image ~/apps/image cd ~/apps/image/unet ``` -Using the contents of [unet_compile_run_inf_rl.sh](unet_compile_run_inf_rl.sh), create a file in the current directory with the same name. +Using the contents of [unet_compile_run_inf_rl.sh](files/unet_compile_run_inf_rl.sh), create a file in the current directory with the same name. Export the path to the dataset which is required for the training. diff --git a/docs/ai-testbed/sambanova_gen1/2022-09-21T19-21-05.html b/docs/ai-testbed/sambanova_gen1/files/2022-09-21T19-21-05.html similarity index 100% rename from docs/ai-testbed/sambanova_gen1/2022-09-21T19-21-05.html rename to docs/ai-testbed/sambanova_gen1/files/2022-09-21T19-21-05.html diff --git a/docs/ai-testbed/sambanova_gen1/bw_unet_compile_run_all.sh b/docs/ai-testbed/sambanova_gen1/files/bw_unet_compile_run_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/bw_unet_compile_run_all.sh rename to docs/ai-testbed/sambanova_gen1/files/bw_unet_compile_run_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/ccle_09_19_22_11_50.log b/docs/ai-testbed/sambanova_gen1/files/ccle_09_19_22_11_50.log similarity index 100% rename from docs/ai-testbed/sambanova_gen1/ccle_09_19_22_11_50.log rename to docs/ai-testbed/sambanova_gen1/files/ccle_09_19_22_11_50.log diff --git a/docs/ai-testbed/sambanova_gen1/tmpeo5ehksn.html b/docs/ai-testbed/sambanova_gen1/files/tmpeo5ehksn.html similarity index 100% rename from docs/ai-testbed/sambanova_gen1/tmpeo5ehksn.html rename to docs/ai-testbed/sambanova_gen1/files/tmpeo5ehksn.html diff --git a/docs/ai-testbed/sambanova_gen1/unet_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_batch.sh b/docs/ai-testbed/sambanova_gen1/files/unet_batch.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_batch.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_batch.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_compile_run_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_compile_run_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_all.sh diff --git 
a/docs/ai-testbed/sambanova_gen1/unet_compile_run_inf_rl.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_inf_rl.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_compile_run_inf_rl.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_inf_rl.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_compile_run_parallel_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_parallel_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_compile_run_parallel_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_parallel_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/uno.yaml b/docs/ai-testbed/sambanova_gen1/files/uno.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_bruce_tmp.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_bruce_tmp.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_bruce_tmp.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_bruce_tmp.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_brw_CCLE.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_brw_CCLE.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_brw_CCLE.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_brw_CCLE.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_rick_tmp.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_rick_tmp.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_rick_tmp.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_rick_tmp.yaml diff --git a/docs/ai-testbed/sambanova_gen1/readme-rick.md b/docs/ai-testbed/sambanova_gen1/readme-rick.md deleted file mode 100644 index fb71a39f1..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme-rick.md +++ /dev/null @@ -1,253 +0,0 @@ -# SambaTune - -## Notes - -```text -#TODOBRW -ssh wilsonb@homes.cels.anl.gov -ssh sm-02 -MobilePass+ password -On sm-02 -source /opt/sambaflow/venv/bin/activate -sambatune_ui --directory /home/wilsonb/tmp/sambatune_gen --port 8580 -#There will be a username and password displayed that you will use in your browser on your laptop. -Command used on laptop for port forward -ssh -XL 8580:127.0.0.1:8580 wilsonb@sm-02.cels.anl.gov -MobilePass+ password -# You will be logged into sm-02 but, you do not need to do anything. -address used in browser on laptop localhost:8580 -#Use username and password from sambatune_ui. -Username -Password -``` - -### Rick - -8/24/2022 - -I have updated ~rweisner/tmp/sambatune with sambatune_ui 1.1 and updated the readme. - -## About SambaTune - -SambaTune is a tool for profiling, debugging, and tuning performance of applications -running on SN hardware. - -The tool automates collection of hardware performance counters, metrics aggregation, -report generation, and visualization. It also automates benchmarking of the application -to compute average throughput over a sufficient number of runs. The tool is designed to -aid the user with performance bottleneck analysis and tuning. - -SambaTune is currently used by SN engineers involved in performance tuning efforts. -SambaTune is also planned for release to external customers to aid with performance -bottleneck analysis and resolution. 
- -## Installation - -```bash -ssh wilsonb@sambanova.alcf.anl.gov -MobilePass+ pwd -ssh sm-01 -``` - -First, enter the virtual environment on **sm-01** or **sm-02**: - -```bash -source /opt/sambaflow/venv/bin/activate -``` - -## Usage - -```console -usage: sambatune [-h] [--artifact-root ARTIFACT_ROOT] [--disable-override] -[--compile-only | -m MODES [MODES ...]] -[--version] -config -positional arguments: -config -YAML file with model, compile, run configuration. -optional arguments: --h, --help ---artifact-root -show this help message and exit -ARTIFACT_ROOT -Custom location to save compile/run artifacts; -defaults to '$DUMP_ROOT/artifact_root' ---disable-override Reuse the placement from the baseline compilation ---compile-only Run compilation of PEFs for selected modes only --m MODES [MODES ...], --modes MODES [MODES ...] -Select modes to execute from ['benchmark', -'instrument', 'run'] default: ['benchmark'] ---version -version of sambatune and sambaflow. -``` - -## Command Overview - -By default, it will run with the benchmarking mode enabled. Use the --modes flag to run -modes individually or in any combination. -Benchmark-Only: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes benchmark -``` - -Instrument-Only: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes instrument -``` - -All modes: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes instrument -``` - -## Command Example - -### Running - -Run the following example on **sm-01** or **sm-02**: - -```bash -mkdir ~/sambatune -cd ~/sambatune -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes benchmark instrument run -``` - -where **small_vae.yaml** is a user-specified configuration file: - -### Samples Config File - -The current directory should be **~/sambatune**. - -Create **small_vae.yaml** with the following content using your favorite editor. - -```yaml -small_vae.yaml: -app: /opt/sambaflow/apps/private/anl/moleculevae.py - -model-args: -b 128 --in-width 512 --in-height 512 - -compile-args: compile --plot --enable-conv-tiling --compiler-configs-file /opt/sambaflow/apps/private/anl/moleculevae/compiler_configs_conv.json --mac-v2 --mac-human-decision /opt/sambaflow/apps/private/anl/moleculevae/symmetric_human_decisions_tiled_v2.json - -run-args: --num-iterations 1000 --input-path /var/tmp/dataset/moleculevae/ras1_prot-pops.h5 --out-path ${HOME}/moleculevae_out --model-id 0 --epochs 10 - -env: - OMP_NUM_THREADS: 16 - SF_RNT_FSM_POLL_BUSY_WAIT: 1 - SF_RNT_DMA_POLL_BUSY_WAIT: 1 - CONVFUNC_DEBUG_RUN: 0 -``` - -## Install SambaTune UI on Your Development Machine - -### Copy Conda Tar File on SambaNova - -On sambanova.alcf.anl.gov: - -```bash -mkdir ~/tmp -cd ~/tmp -cp /home/rweisner/tmp/sambatune/sambatune_1.1.tar . -``` - -### Copy Conda Tar File To Your Dev Machine - -On your dev machine: - -```bash -mkdir /tmp -cd /tmp -scp ALCFUserID@sambanova:tmp/sambatune/sambatune_1.1.tar . -# Or -scp ac.rick.weisner@lambda0:tmp/sambatune/sambatune_1.1.tar . -# Or -scp wilsonb@sambanova:tmp/sambatune/sambatune_1.1.tar . 
-``` - -### Install Docker - -If necessary: - -```bash -sudo apt-get install docker -# Or -sudo snap install docker -``` - -### Docker - -If you have changed directories: - -```bash -cd /tmp -``` - -Load Docker image: - -```bash -sudo docker image load -i sambatune_1.1.tar -``` - -List Docker images: - -```bash -sudo docker image ls -``` - -Your output will look something like: - -```text -REPOSITORY TAG IMAGE ID CREATED SIZE -artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client 1.1 bf1d5834776d 3 months ago 737MB -``` - -This is the image you want -artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client 1.1 bf1d5834776d 3 months ago 737MB - -### Run the Docker Container - -Make a work directory: - -```bash -mkdir -p /path/to/work -# Or -mkdir -p /home/bwilson/sambatune/work -``` - -Run the container: - -```bash -sudo docker container run --mount type=bind,source=/path/to/work,target=/work -it -p 5050:8576 artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client:1.1 -# Or -sudo docker container run --mount type=bind,source=/home/bwilson/sambatune/work,target=/work -it -p 5050:8576 artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client:1.1 -``` - -The first time you run the above command, you will see many layers being loaded. It will load immediate from then on. - -My artifact_root is in /Users/rickw/work/vae_tst/artifact_root. - -Start the UI: -It will tell you the port and password. - -sambatune_ui --directory /work/lincoln/vae_tst/artifact_root/sambatune_gen - -You will see something like: -root@477a49bd9e55:/project# sambatune_ui --directory /work/lincoln/vae_tst/artifact_root/sambatune_gen -Starting server on localhost:8576 with the following directory: /work/lincoln/vae_tst/artifact_root/sambatune_gen -with the, - username: "admin", password: "fd11af8a-edad-11ec-89c9-0242ac110002" - * Serving Flask app 'sambatune.uiwebserver' (lazy loading) - * Environment: production - WARNING: This is a development server. Do not use it in a production deployment. - Use a production WSGI server instead. - * Debug mode: off - * Running on all addresses. - WARNING: This is a development server. Do not use it in a production deployment. - * Running on http://172.17.0.2:8576/ (Press CTRL+C to quit) - -RCW: use localhost:8576 to connect - - -Now connect via browser. diff --git a/docs/ai-testbed/sambanova_gen1/readme.md b/docs/ai-testbed/sambanova_gen1/readme.md deleted file mode 100644 index aeb861e89..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme.md +++ /dev/null @@ -1,15 +0,0 @@ -# SambaNova Documentation - -* compiler-options.pdf -* getting-started.pdf -* intro-tutorial-pytorch.pdf -* release-notes.pdf -* run-examples-language.pdf -* run-examples-pytorch.pdf -* run-examples-vision.pdf -* runtime-faq.pdf -* slurm-sambanova.pdf -* snconfig-userguide.pdf -* sntilestat-manpage.pdf -* using-layernorm.pdf -* using-venvs.pdf diff --git a/docs/ai-testbed/sambanova_gen1/readme_rick_02.md b/docs/ai-testbed/sambanova_gen1/readme_rick_02.md deleted file mode 100644 index 7ee9bbb77..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme_rick_02.md +++ /dev/null @@ -1,22 +0,0 @@ -# Notes - -```bash -source /software/sambanova/envs/sn_env.sh -source ~/.bashrc -cd ~/tmp -cp -rf /home/rweisner/tmp/unet . 
-cd ~/tmp/unet -export OUTDIR=~/apps/image/unet -export DATADIR=/software/sambanova/dataset/kaggle_3m -sbatch --gres=rdu:1 --tasks-per-node 4 --nodes 2 --nodelist sm-02,sm-01 --cpus-per-task=16 ./unet_batch.sh ${NP} ${NUM_WORKERS} -./unet_compile_run_all.sh compile 256 256 -ll -``` - -```console -On sm-01 or sm-02 -/home/rweisner/tmp/gpt for mpirun -/home/rweisner/tmp/unet -unet_compile_run_all.sh for slurm -./unet_compile_run_all.sh compile 256 256 -``` diff --git a/docs/ai-testbed/sambanova_gen1/performance-tools.md b/docs/ai-testbed/sambanova_gen1/unused/performance-tools.md similarity index 100% rename from docs/ai-testbed/sambanova_gen1/performance-tools.md rename to docs/ai-testbed/sambanova_gen1/unused/performance-tools.md diff --git a/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md index c35a94a23..d059725d2 100644 --- a/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md +++ b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md @@ -1,8 +1,7 @@ # Steps to Run BERT-Large on Sambanova DataScale SN10-8R -* BERT Code is in the [Bert](./bert/) directory here for your reference. - * [transformners_hook.py](./bert/transformers_hook.py): contains code for BERT. + Bruce -python /opt/sambaflow/apps/private/anl/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --pef-name=uno_16_4_500_ws --output-folder='.' --mac-v1 - -export OMP_NUM_THREADS=1 -python /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --pef=./uno_16_4_500_ws/uno_16_4_500_ws.pef --in_dir /var/tmp/raw/ --mac-v1 -``` - -```text -#TODOBRW This works. 9/19/22 -sm-01/home/wilsonb/tmp/uno_test/uno_ccle.yaml -app: /opt/sambaflow/apps/private/anl/uno_full.py - -model-args: --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial - -compile-args: compile --plot --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1 - -run-args: --multiprocess-pickle --use-pickle-train --measure-spatial --train-samba-spatial --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_500 --converted-pickle - -env: - OMP_NUM_THREADS: 16, - SF_RNT_NUMA_BIND: 2 -``` - -Run the following example: - -```bash -sambatune uno_ccle.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run -``` - -```bash -#TODOBRW -# Stand-alone -export UNO=. -export NS=500 -srun python /opt/sambaflow/apps/private/anl/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --pef-name=uno_16_4_${NS}_ws --output-folder='.' --mac-v1 - -export OMP_NUM_THREADS=1 -srun python /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} - -export UNO=. 
-export NS=500 -export OMP_NUM_THREADS=1 -srun pyinstrument /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} > pyinstrument_1.13.log 2>&1 - - - -Ricks run python ${UNO}/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=“out/uno_16_4_${NS}/uno_16_4_${NS}.pef” --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE -``` - -```text -#TODOBRW -sm-01/home/wilsonb/DL/Sambanova/apps_1.12/private/anl/uno_brw_CCLE_1_12.yaml -export OMP_NUM_THREADS=16 -app: /home/wilsonb/DL/Sambanova/apps_1.12/private/anl/uno_full.py - -model-args: --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial - -compile-args: compile --plot --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1 - -run-args: --measure-spatial --train-samba-spatial --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_500 - -env: - OMP_NUM_THREADS: 16, - SF_RNT_NUMA_BIND: 2 -``` - -Run the following example: - -```bash -sambatune uno_brw_CCLE_1_12.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run - -export UNO=. -export NS=50 -export OMP_NUM_THREADS=1 - -srun python /opt/sambaflow/apps/private/anl/uno_full.py compile --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1 - -xsrun pyinstrument /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} --epochs 1 > my.log 2>&1 - -srun python /opt/sambaflow/apps/private/anl/uno_full.py run --multiprocess-pickle --measure-spatial --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./out/uno_full_16_47_${NS}/uno_full_16_47_${NS}.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_${NS} > pyinstrument_1.13.log 2>&1 - -cat my.log # Has pyinstrument run name. -pyinstrument --load-prev 2022-09-21T19-21-05 -r html - - -1.13 - -source /opt/sambaflow/venv/bin/activate -cd ~/tmp/uno_test/ -export UNO=. -export NS=500 -export OMP_NUM_THREADS=1 -export PATH=/opt/sambaflow/bin:$PATH -sntilestat - - - -./uno_pickl.sh compile 500 -./uno_pickl.sh run 500 - -``` - -```bash -sambatune uno_brw_CCLE_1_12.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run - -export UNO=. 
-export NS=50 -export OMP_NUM_THREADS=1 - -srun python /opt/sambaflow/apps/private/anl/uno_full.py compile --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1 - -xsrun pyinstrument /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} --epochs 1 > my.log 2>&1 - -srun python /opt/sambaflow/apps/private/anl/uno_full.py run --multiprocess-pickle --measure-spatial --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./out/uno_full_16_47_${NS}/uno_full_16_47_${NS}.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_${NS} > pyinstrument_1.13.log 2>&1 - -cat my.log # Has pyinstrument run name. -pyinstrument --load-prev 2022-09-21T19-21-05 -r html - - -1.13 - -source /opt/sambaflow/venv/bin/activate -cd ~/tmp/uno_test/ -export UNO=. -export NS=500 -export OMP_NUM_THREADS=1 -export PATH=/opt/sambaflow/bin:$PATH -sntilestat -``` - -uno_pickl.sh - -```bash -#! /bin/bash -x -#set -e -source /opt/sambaflow/venv/bin/activate -SECONDS=0 -NS=${2} -UNO=/opt/sambaflow/apps/private/anl/ -DS="ALL" -DS="CCLE" - -BS=$((NS*16)) -export OMP_NUM_THREADS=16 - -echo "Model: UNO_SPA_TRN" -echo "Date: " $(date +%m/%d/%y) -echo "Time: " $(date +%H:%M) -if [ "${1}" == "convert" ] ; then -python3 ${UNO}/uno/uno_data_loaders_converted.py --in_dir /var/tmp/raw/ --out_dir /software/sambanova/dataset/${DS}_16_${NS} --batch-size ${BS} --train_sources ${DS} --file-write-frequency 10 - - -elif [ "${1}" == "compile" ] ; then - echo "COMPILE" - python ${UNO}/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --mac-human-decision ${UNO}/samba_uno/human_decisions_spatial.json --pef-name="uno_16_4_${NS}" --mac-v1 - - -elif [ "${1}" == "run" ] ; then - echo "RUN ${DS}" - SF_RNT_NUMA_BIND=2 - #python ${UNO}/uno_full.py run --acc-test --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE - python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --epochs 1 - #python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" - -elif [ "${1}" == "pyinstrument" ] ; then - echo "RUN ${DS}" - SF_RNT_NUMA_BIND=2 - #python ${UNO}/uno_full.py run --acc-test --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE - pyinstrument ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir 
/software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --epochs 1 - #python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" - -elif [ "${1}" == "no_pickle" ] ; then - echo "no_pickle ${DS}" - SF_RNT_NUMA_BIND=2 - python ${UNO}/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE - -elif [ "${1}" == "mp" ] ; then -echo "Duration: " $SECONDS - -elif [ "${1}" == "mp" ] ; then -echo "Duration: " $SECONDS -echo "PERF" -python uno_full.py measure-performance --measure-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --num-iterations 20 --mac-v1 -fi - -echo "Duration: " $SECONDS -``` - -```bash -./uno_pickl.sh compile 500 -./uno_pickl.sh run 500 -./uno_pickl.sh pyinstrument 500 -pyinstrument --load-prev 2022-09-22T18-31-24 -r html -stdout is a terminal, so saved profile output to /tmp/tmpeo5ehksn.html -cp /tmp/tmpeo5ehksn.html . -``` - -On dev terminal - -```bash -scp wilsonb@sambanova.alcf.anl.gov:tmp/uno_test/tmpeo5ehksn.html . -``` - -View in local browser. - -### Running - -Create a directory for your work. - -```bash -mkdir ~/sambatune -cd ~/sambatune -``` - -Create **small_vae.yaml** with the following content using your favorite editor. - -```yaml -app: /opt/sambaflow/apps/private/anl/moleculevae.py - -model-args: -b 128 --in-width 512 --in-height 512 - -compile-args: compile --plot --enable-conv-tiling --compiler-configs-file /opt/sambaflow/apps/private/anl/moleculevae/compiler_configs_conv.json --mac-v2 --mac-human-decision /opt/sambaflow/apps/private/anl/moleculevae/symmetric_human_decisions_tiled_v2.json - -run-args: --input-path /var/tmp/dataset/moleculevae/ras1_prot-pops.h5 --out-path ${HOME}/moleculevae_out --model-id 0 --epochs 10 - -env: - OMP_NUM_THREADS: 16 - SF_RNT_FSM_POLL_BUSY_WAIT: 1 - SF_RNT_DMA_POLL_BUSY_WAIT: 1 - CONVFUNC_DEBUG_RUN: 0 -``` - -Run the following example: - -```bash -sambatune small_vae.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run -``` - -Create **linear_net.yaml** with the following content using your favorite editor. - -```yaml -app: /opt/sambaflow/apps/micros/linear_net.py - -model-args: > - -b 1024 - -mb 64 - --in-features 8192 - --out-features 4096 - --repeat 128 - --inference - -compile-args: > - --n-chips 2 - --plot - -env: - SF_RNT_FSM_POLL_BUSY_WAIT: 1 - SF_RNT_DMA_POLL_BUSY_WAIT: 1 - CONVFUNC_DEBUG_RUN": 0 -``` - -**NOTE:** The following takes 45 minutes to run. - -Run the following example: - -```bash -sambatune linear_net.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run -``` - -```bash -#TODOBRW -cd ~/tmp/uno_test -screen -sambatune uno.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run -``` - -where **linear_net.yaml** is a user-specified configuration file you created above. - -## SambaTune UI - -### Port Availability - -It is recommended that you check if the port you want to use is available. 
You may check by: - -```bash -ps -elf | grep desired_port -``` - -Example: - -```bash -ps -elf | grep 8576 -``` - -Alternatively, you may check for all ports in use by **sambatune_ui**: - -```bash -ps -elf | grep sambatune_ui -``` - -If you need to free a port that you are finished with, you may use the **kill** command. - -### Start SambaTune UI - -If you followed the above directions, your artifact_root will be at ~/sambatune/artifact_root. - -Start the UI: - -It will tell you the **username** and **password**. - -**NOTE:** It is recommended to use a port other than **8576** in case someone else is using it. Select another port close to **8576**. - -Next - -```bash -sambatune_ui --directory ~/sambatune/artifact_root/sambatune_gen/ --port 8576 -``` - -```bash -#TODOBRW -sambatune_ui --directory ~/sambatune/artifact_root/sambatune_gen/ --port 8580 -sambatune_ui --directory /home/wilsonb/tmp/uno_test/artifact_root/sambatune_gen --port 8580 -username: "admin", password: "4f7cac2c-351e-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "aaf1fc88-35c8-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "bf64e4f8-3831-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "8feca89e-384c-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "355222d6-3a88-11ed-93a3-f7ef9c6e5d46" -``` - -You will see something like: - -```console -with the, - username: "admin", password: "05c63938-2941-11ed-93a3-f7ef9c6e5d46" -[2022-08-31 15:24:36 +0000] [1344959] [Info] Starting gunicorn 20.1.0 -[2022-08-31 15:24:36 +0000] [1344959] [Info] Listening at: http://0.0.0.0:8576 (1344959) -[2022-08-31 15:24:36 +0000] [1344959] [Info] Using worker: sync -[2022-08-31 15:24:36 +0000] [1345092] [Info] Booting worker with pid: 1345092 -[2022-08-31 15:24:36 +0000] [1345093] [Info] Booting worker with pid: 1345093 -``` - -**NOTE:** Write down the username and password. - -**NOTE:** The password only works with this one instance of sambatune_ui. If you stop this instance of sambatune_ui and start another instance, it will have a new password. - -**NOTE:** You will need to **>** or use the **kill** command to stop sambatune_ui when you have finished. -Not doing so will tie up the port. -You can **ps -elf | grep the_port_you_used** to find the running processes. -If you are not comfortable doing this, please ask for help. - -## Use Port-Forwarding - -This describes the steps to set up port-forwarding for applications, -like SambaTune UI, which runs on the SambaNova system and binds to one or more ports. -This example uses 8576 and 18576 as port numbers. **Using port numbers other than these may -avoid collisions with other users.** - -### From your local machine - -This command sets up a port forward SambaNova login node to your local machine. - -Run - -```bash -ssh -N -f -L localhost:18576:localhost:18576 ALCFUserID@sambanova.alcf.anl.gov -... -Password: < MobilePass+ code > - -ssh ALCFUserID@sambanova.alcf.anl.gov -``` - -```bash -#TODOBRW -ssh -v -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov -ssh -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov -... -Password: < MobilePass+ code > - -ssh wilsonb@sambanova.alcf.anl.gov -``` - -*replacing* ***ALCFUserID*** *with your ALCF User ID.* - -### From **sambanova.alcf.anl.gov** - -This command sets up a port forward from a SambaNova node to the sambanova login machine. - -Below are the commands specific to sm-01. You may replace **sm-01** with **sm-02** when using that system. 
- -Run - -**NOTE: The full name is sm-01.ai.alcf.anl.gov and it may also be used.** - -```bash -ssh -N -f -L localhost:18576:localhost:8576 ALCFUserID@sm-01 -``` - -```bash -#TODOBRW -ssh -N -f -L localhost:8580:localhost:8580 wilsonb@sm-01 -``` - -### Browser on Local Machine - -Then, navigate in your browser to, in this example, [http://localhost:18576](http://localhost:18576) on your local machine. - -Use the username and password from **sm-01** to log in. - -## SSH Notes - -Explanation of **ssh** command: - -```text --N : no remote commands - --f : put ssh in the background - --L ::: : - -The full command line will forward : (local scope) to : (remote scope) -``` - -Adapted from: [How can I run Tensorboard on a remote server?](https://stackoverflow.com/questions/37987839/how-can-i-run-tensorboard-on-a-remote-server) diff --git a/docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md b/docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md index 370835fc9..7a9362606 100644 --- a/docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md +++ b/docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md @@ -2,6 +2,9 @@ ## Notes +Rick 4/16/2023 [10:16 AM] +/home/rweisner/sambatune_ui_dir contains the 1.15.3 version which is the latest released version. It should work on your experimental. You will need browser access to wherever you install it. + ```bash cd /home/rweisner/tmp/uno_test ``` @@ -32,133 +35,379 @@ Password ## About SambaTune -SambaTune is a tool for profiling and performance tuning of applications that are running on SambaNova DataScale hardware. +SambaTune is a tool for profiling, debugging, and tuning the performance of applications +running on SN hardware. The tool automates the collection of hardware performance counters, metrics aggregation, report generation, and visualization. It also automates benchmarking of the application to compute average throughput over a sufficient number of runs. The tool is designed to aid the user with performance bottleneck analysis and tuning. +SambaTune is currently used by SN engineers involved in performance tuning efforts. +SambaTune is also planned for release to external customers to aid with performance +bottleneck analysis and resolution. + ## Run SambaTune ```bash ssh ALCFUserID@sambanova.alcf.anl.gov # Enter MobilePass+ pass code -ssh sn30-r1-h1 +ssh sm-01 ``` ```bash #TODOBRW ssh wilsonb@sambanova.alcf.anl.gov # Enter MobilePass+ pass code -ssh sn30-r1-h1 +ssh sm-01 ``` -## TODO +First, enter the virtual environment on **sm-01** or **sm-02**: + +```bash +source /opt/sambaflow/venv/bin/activate +``` -Install the SambaTune package on the host that is connected to the SambaNova hardware. +Update path: ```bash -sudo apt install -y sambanova-sambatune -sudo apt install -y sambaflow-apps-micros +export PATH=/opt/sambaflow/bin:$PATH ``` -## SambaTune Client Installation +## Usage -TODO: Waiting for Rick to make a .whl file available. +```console +usage: sambatune [-h] [--artifact-root ARTIFACT_ROOT] [--disable-override] + [--compile-only | -m MODES [MODES ...]] [--version] + config -## Establish Files +positional arguments: + config YAML file with model, compile, run configuration. -A sample application, linear_net.py is included with your installation at /opt/sambaflow/apps/micros/linear_net.py. 
+optional arguments: + -h, --help show this help message and exit + --artifact-root ARTIFACT_ROOT + Custom location to save compile/run artifacts; + defaults to '$DUMP_ROOT/artifact_root' (default: None) + --disable-override Reuse the placement from the baseline compilation + (default: False) + --compile-only Run compilation of PEFs for selected modes only + (default: False) + -m MODES [MODES ...], --modes MODES [MODES ...] + Select modes to execute from ['benchmark', + 'instrument', 'run'] (default: ['benchmark']) + --version version of sambatune and sambaflow. +``` -### Set Up +## Command Overview -Create the following directory and change to it if you have not already done so. +By default, it will run with the benchmarking mode enabled. Use the --modes flag to run +modes individually or in any combination. +Benchmark-Only: -```console -mkdir ~/app-test -cd ~/app-test +```bash +sambatune example_net.yaml --artifact-root $(pwd)/artifact_root --modes benchmark ``` -### Copy linear_net.py +Instrument-Only: -A sample application, linear_net.py, is included with your installation at /opt/sambaflow/apps/micros/linear_net.py. +```bash +sambatune example_net.yaml --artifact-root $(pwd)/artifact_root --modes instrument +``` -Copy the file to the current directory: +All modes: ```bash -cp /opt/sambaflow/apps/micros/linear_net.py . +sambatune example_net.yaml --artifact-root $(pwd)/artifact_root --modes instrument ``` -### Create linear_net.yaml +## Command Example -Create the file **linear_net.yaml** in the current directory using your favorite editor. -Copy the following **yaml**. +```bash +# From Bill +python /opt/sambaflow/apps/private/anl/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --pef-name=uno_16_4_500_ws --output-folder=/home/arnoldw//models_dir/1520847 --mac-v1 -```yaml -app: linear.py -model-args: -b 128 -mb 64 --in-features 512 --out-features 128 -compile-args: compile --plot -run-args: -n 10000 +python /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --pef=/home/arnoldw//models_dir/1520847/uno_16_4_500_ws/uno_16_4_500_ws.pef --in_dir /var/tmp/raw/ --mac-v1 ``` -## Command Overview +```bash +# From Bill --> Bruce +python /opt/sambaflow/apps/private/anl/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --pef-name=uno_16_4_500_ws --output-folder='.' --mac-v1 -By default, it will run with the benchmarking mode enabled. Use the --modes flag to run -modes individually or in any combination. -Benchmark-Only: +export OMP_NUM_THREADS=1 +python /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial --pef=./uno_16_4_500_ws/uno_16_4_500_ws.pef --in_dir /var/tmp/raw/ --mac-v1 +``` + +```text +#TODOBRW This works. 
+
+```text
+#TODOBRW This works. 9/19/22
+sm-01/home/wilsonb/tmp/uno_test/uno_ccle.yaml
+app: /opt/sambaflow/apps/private/anl/uno_full.py
+
+model-args: --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial
+
+compile-args: compile --plot --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1
+
+run-args: --multiprocess-pickle --use-pickle-train --measure-spatial --train-samba-spatial --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_500 --converted-pickle
+
+env:
+  OMP_NUM_THREADS: 16,
+  SF_RNT_NUMA_BIND: 2
+```
+
+Run the following example:
+
+```bash
+sambatune uno_ccle.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run
+```
+
+```bash
+#TODOBRW
+# Stand-alone
+export UNO=.
+export NS=500
+srun python /opt/sambaflow/apps/private/anl/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --pef-name=uno_16_4_${NS}_ws --output-folder='.' --mac-v1
+
+export OMP_NUM_THREADS=1
+srun python /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS}
+
+export UNO=.
+export NS=500
+export OMP_NUM_THREADS=1
+srun pyinstrument /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} > pyinstrument_1.13.log 2>&1
+
+# Rick's run:
+python ${UNO}/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE
+```
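+
+The pyinstrument wrapper above saves its session to disk rather than printing it; the session name lands at the end of the log. A sketch of rendering it to HTML afterwards (the timestamp is a placeholder -- use the one your log reports):
+
+```bash
+# Hypothetical session id taken from the end of pyinstrument_1.13.log
+pyinstrument --load-prev 2022-09-21T19-21-05 -r html
+```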
+
+```text
+#TODOBRW
+sm-01/home/wilsonb/DL/Sambanova/apps_1.12/private/anl/uno_brw_CCLE_1_12.yaml
+export OMP_NUM_THREADS=16
+app: /home/wilsonb/DL/Sambanova/apps_1.12/private/anl/uno_full.py
+
+model-args: --weight-sharing -b 16 -mb 4 --num-spatial-batches 500 --mapping spatial
+
+compile-args: compile --plot --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1
+
+run-args: --measure-spatial --train-samba-spatial --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_500
+
+env:
+  OMP_NUM_THREADS: 16,
+  SF_RNT_NUMA_BIND: 2
+```
+
+Run the following example:
+
+```bash
+sambatune uno_brw_CCLE_1_12.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run
+
+export UNO=.
+export NS=50
+export OMP_NUM_THREADS=1
+
+srun python /opt/sambaflow/apps/private/anl/uno_full.py compile --mac-human-decision /opt/sambaflow/apps/private/anl/samba_uno/human_decisions_spatial.json --mac-v1
+
+xsrun pyinstrument /opt/sambaflow/apps/private/anl/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./uno_16_4_${NS}_ws/uno_16_4_${NS}_ws.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --data-dir /software/sambanova/dataset/CCLE_16_${NS} --epochs 1 > my.log 2>&1
+
+srun python /opt/sambaflow/apps/private/anl/uno_full.py run --multiprocess-pickle --measure-spatial --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef=./out/uno_full_16_47_${NS}/uno_full_16_47_${NS}.pef --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE --lr 0.001 --data-dir /software/sambanova/dataset/CCLE_16_${NS} > pyinstrument_1.13.log 2>&1
+
+cat my.log # Has pyinstrument run name.
+pyinstrument --load-prev 2022-09-21T19-21-05 -r html
+# 1.13
+source /opt/sambaflow/venv/bin/activate
+cd ~/tmp/uno_test/
+export UNO=.
+export NS=500
+export OMP_NUM_THREADS=1
+export PATH=/opt/sambaflow/bin:$PATH
+sntilestat
+
+./uno_pickl.sh compile 500
+./uno_pickl.sh run 500
+```
+
+uno_pickl.sh
+
+```bash
+#! /bin/bash -x
+#set -e
+source /opt/sambaflow/venv/bin/activate
+SECONDS=0
+NS=${2}
+UNO=/opt/sambaflow/apps/private/anl/
+DS="ALL"
+DS="CCLE"
+
+BS=$((NS*16))
+export OMP_NUM_THREADS=16
+
+echo "Model: UNO_SPA_TRN"
+echo "Date: " $(date +%m/%d/%y)
+echo "Time: " $(date +%H:%M)
+if [ "${1}" == "convert" ] ; then
+python3 ${UNO}/uno/uno_data_loaders_converted.py --in_dir /var/tmp/raw/ --out_dir /software/sambanova/dataset/${DS}_16_${NS} --batch-size ${BS} --train_sources ${DS} --file-write-frequency 10
+
+elif [ "${1}" == "compile" ] ; then
+  echo "COMPILE"
+  python ${UNO}/uno_full.py compile --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --mac-human-decision ${UNO}/samba_uno/human_decisions_spatial.json --pef-name="uno_16_4_${NS}" --mac-v1
+
+elif [ "${1}" == "run" ] ; then
+  echo "RUN ${DS}"
+  SF_RNT_NUMA_BIND=2
+  #python ${UNO}/uno_full.py run --acc-test --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE
+  python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --epochs 1
+  #python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef"
+
+elif [ "${1}" == "pyinstrument" ] ; then
+  echo "RUN ${DS}"
+  SF_RNT_NUMA_BIND=2
+  #python ${UNO}/uno_full.py run --acc-test --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE
+  pyinstrument ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --epochs 1
+  #python ${UNO}/uno_full.py run --mac-v1 --multiprocess-pickle --use-pickle-train --train-samba-spatial -b 16 -mb 4 --num-spatial-batches ${NS} --lr 0.001 --mapping spatial --data-dir /software/sambanova/dataset/${DS}_16_${NS} --converted-pickle --train_sources ${DS} --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef"
+
+elif [ "${1}" == "no_pickle" ] ; then
+  echo "no_pickle ${DS}"
+  SF_RNT_NUMA_BIND=2
+  python ${UNO}/uno_full.py run --train-samba-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --in_dir /var/tmp/raw/ --mac-v1 --train_source CCLE
+
+elif [ "${1}" == "mp" ] ; then
+echo "Duration: " $SECONDS
+echo "PERF"
+python uno_full.py measure-performance --measure-spatial --weight-sharing -b 16 -mb 4 --num-spatial-batches ${NS} --mapping spatial --pef="out/uno_16_4_${NS}/uno_16_4_${NS}.pef" --num-iterations 20 --mac-v1
+fi
+
+echo "Duration: " $SECONDS
+```
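+
+If the pickled dataset under /software/sambanova/dataset has not been generated yet, the script's convert branch can produce it from the raw input first (a sketch, assuming the raw files are staged under /var/tmp/raw/ as above):
+
+```bash
+./uno_pickl.sh convert 500
+```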
+
+```bash
+./uno_pickl.sh compile 500
+./uno_pickl.sh run 500
+./uno_pickl.sh pyinstrument 500
+pyinstrument --load-prev 2022-09-22T18-31-24 -r html
+# stdout is a terminal, so saved profile output to /tmp/tmpeo5ehksn.html
+cp /tmp/tmpeo5ehksn.html .
+```
+
+On your dev terminal:
+
+```bash
+scp wilsonb@sambanova.alcf.anl.gov:tmp/uno_test/tmpeo5ehksn.html .
+```
+
+View it in your local browser.
+
+### Running
+
+Create a directory for your work.
+
+```bash
+mkdir ~/sambatune
+cd ~/sambatune
+```
+
+Create **small_vae.yaml** with the following content using your favorite editor.
+
+```yaml
+app: /opt/sambaflow/apps/private/anl/moleculevae.py
+
+model-args: -b 128 --in-width 512 --in-height 512
+
+compile-args: compile --plot --enable-conv-tiling --compiler-configs-file /opt/sambaflow/apps/private/anl/moleculevae/compiler_configs_conv.json --mac-v2 --mac-human-decision /opt/sambaflow/apps/private/anl/moleculevae/symmetric_human_decisions_tiled_v2.json
+
+run-args: --input-path /var/tmp/dataset/moleculevae/ras1_prot-pops.h5 --out-path ${HOME}/moleculevae_out --model-id 0 --epochs 10
+
+env:
+  OMP_NUM_THREADS: 16
+  SF_RNT_FSM_POLL_BUSY_WAIT: 1
+  SF_RNT_DMA_POLL_BUSY_WAIT: 1
+  CONVFUNC_DEBUG_RUN: 0
+```
+
+Run the following example:
+
+```bash
+sambatune small_vae.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run
+```
+
+Create **linear_net.yaml** with the following content using your favorite editor.
+
+```yaml
+app: /opt/sambaflow/apps/micros/linear_net.py
+
+model-args: >
+  -b 1024
+  -mb 64
+  --in-features 8192
+  --out-features 4096
+  --repeat 128
+  --inference
+
+compile-args: >
+  --n-chips 2
+  --plot
+
+env:
+  SF_RNT_FSM_POLL_BUSY_WAIT: 1
+  SF_RNT_DMA_POLL_BUSY_WAIT: 1
+  CONVFUNC_DEBUG_RUN: 0
+```
+
+**NOTE:** The following takes 45 minutes to run.
+
+Run the following example:
+
+```bash
+sambatune linear_net.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run
+```
+
+```bash
+#TODOBRW
+cd ~/tmp/uno_test
+screen
+sambatune uno.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run
+```

where **linear_net.yaml** is a user-specified configuration file you created above.

## SambaTune UI

@@ -248,7 +497,7 @@ Run

```bash
ssh -N -f -L localhost:18576:localhost:18576 ALCFUserID@sambanova.alcf.anl.gov
...
-Password: < MobilPass+ code >
+Password: < MobilePass+ code >

ssh ALCFUserID@sambanova.alcf.anl.gov
```

@@ -258,7 +507,7 @@ ssh ALCFUserID@sambanova.alcf.anl.gov
ssh -v -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov
ssh -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov
...
-Password: < MobilPass+ code > +Password: < MobilePass+ code > ssh wilsonb@sambanova.alcf.anl.gov ``` diff --git a/mkdocs.yml b/mkdocs.yml index 65ebab065..128f7089c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -142,6 +142,7 @@ nav: - OpenMP: theta/programming-models/openmp-theta.md - Raja: theta/programming-models/raja.md - Debugging Tools: + - Overview: theta/debugging-tools/debugging-overview.md - Arm DDT: theta/debugging-tools/arm-ddt.md - ATP and STAT: theta/debugging-tools/atp-and-stat.md - GDB: theta/debugging-tools/gdb.md @@ -286,32 +287,10 @@ not_in_nav: | todo.md TODO.md notes.md - ai-testbed/howto-contribute.md - ai-testbed/cerebras/README.md - ai-testbed/cerebras/performance-tools.md - ai-testbed/graphcore/README.md - ai-testbed/graphcore/Scaling-ResNet50.md - ai-testbed/graphcore/cosmictagger-conversion.md - ai-testbed/graphcore/cosmictagger-ddp.md - ai-testbed/graphcore/multi-node-setup.md - ai-testbed/graphcore/profiling-mnist.md - ai-testbed/graphcore/profiling-resnet50.md - ai-testbed/graphcore/profiling.md - ai-testbed/sambanova_gen2/README.md - ai-testbed/sambanova_gen2/TODO.md - ai-testbed/sambanova_gen2/cosmictagger-conversion.md - ai-testbed/sambanova_gen2/performance-tools.md - ai-testbed/sambanova_gen2/running-GPT2.md - ai-testbed/sambanova_gen2/sambanova.md - ai-testbed/sambanova_gen2/sambatune-user-guide.md - ai-testbed/sambanova_gen2/not_published/running-GPT2-multi-node.md - ai-testbed/sambanova_gen2/not_published/running-bert-large-on-sn30.md - ai-testbed/sambanova_gen2/not_published/sambatune-user-guide.md running-jobs/gronkulator.md running-jobs/pbs-admin-quick-start-guide.md services/index.md theta/theta-decommissioning.md - theta/debugging-tools/debugging-overview.md theme: name: 'material' From 54a538bdeb693af12b237fd9db1be6420bb833e4 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 27 Jul 2023 18:37:19 -0500 Subject: [PATCH 06/10] Final cleanup of dead docs --- docs/running-jobs/gronkulator.md | 2 -- .../{ => unused}/pbs-admin-quick-start-guide.md | 0 docs/services/index.md | 6 ------ mkdocs.yml | 3 --- 4 files changed, 11 deletions(-) delete mode 100644 docs/running-jobs/gronkulator.md rename docs/running-jobs/{ => unused}/pbs-admin-quick-start-guide.md (100%) delete mode 100644 docs/services/index.md diff --git a/docs/running-jobs/gronkulator.md b/docs/running-jobs/gronkulator.md deleted file mode 100644 index 931000204..000000000 --- a/docs/running-jobs/gronkulator.md +++ /dev/null @@ -1,2 +0,0 @@ -# The Gronkulator: Job Status Display -Content is still being developed. Please check back. diff --git a/docs/running-jobs/pbs-admin-quick-start-guide.md b/docs/running-jobs/unused/pbs-admin-quick-start-guide.md similarity index 100% rename from docs/running-jobs/pbs-admin-quick-start-guide.md rename to docs/running-jobs/unused/pbs-admin-quick-start-guide.md diff --git a/docs/services/index.md b/docs/services/index.md deleted file mode 100644 index 448e5c966..000000000 --- a/docs/services/index.md +++ /dev/null @@ -1,6 +0,0 @@ -# ALCF Services - -Below is a list of some of the services ALCF makes availble for use across our HPC clusters. - -- [JupyterHub](jupyter-hub.md): An interactive computing environment for different languages. -- [Continuous Integration](continuous-integration.md): An automated processes to help preform build, test, package, and deploy activities. 
diff --git a/mkdocs.yml b/mkdocs.yml
index 128f7089c..002380c08 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -287,9 +287,6 @@ not_in_nav: |
   todo.md
   TODO.md
   notes.md
-  running-jobs/gronkulator.md
-  running-jobs/pbs-admin-quick-start-guide.md
-  services/index.md
   theta/theta-decommissioning.md

theme:

From 049523d3b0c9d94c97845e602bd1308b6016ffec Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker
Date: Thu, 27 Jul 2023 18:59:11 -0500
Subject: [PATCH 07/10] Restore Services top level .md

---
 docs/ai-testbed/{index.md => getting-started.md} | 0
 docs/index.md                                    | 4 ++--
 docs/services/getting-started.md                 | 6 ++++++
 mkdocs.yml                                       | 6 ++++--
 4 files changed, 12 insertions(+), 4 deletions(-)
 rename docs/ai-testbed/{index.md => getting-started.md} (100%)
 create mode 100644 docs/services/getting-started.md

diff --git a/docs/ai-testbed/index.md b/docs/ai-testbed/getting-started.md
similarity index 100%
rename from docs/ai-testbed/index.md
rename to docs/ai-testbed/getting-started.md
diff --git a/docs/index.md b/docs/index.md
index e9a057d61..239f8e039 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -9,9 +9,9 @@ Our user guides contain information for:
 - [Theta](theta/hardware-overview/machine-overview.md): Information on how to use our Cray XC40/KNL supercomputer.
 - [ThetaGPU](theta-gpu/hardware-overview/theta-gpu-machine-overview.md): Information on how to use our NVIDIA DGX A100 supercomputer.
 - [Cooley](cooley/cooley-overview.md): Information on how to use our visualization cluster.
-- [AI Testbed](https://docs.alcf.anl.gov/ai-testbed/): Information on how to use our AI Accelerators.
+- [AI Testbed](ai-testbed/getting-started.md): Information on how to use our AI Accelerators.
 - [Aurora/Sunspot](https://www.alcf.anl.gov/support-center/aurora-sunspot): Information on getting your code ready for our upcoming exacale supercomputer.
-- [Services](services/index.md): Information on how to use various services provided across clusters.
+- [Services](services/getting-started.md): Information on how to use various services provided across clusters.
 - [Facility Policies](policies/facility-policies.md): Information on our policies and procedures.

 ## How to Get Access
diff --git a/docs/services/getting-started.md b/docs/services/getting-started.md
new file mode 100644
index 000000000..448e5c966
--- /dev/null
+++ b/docs/services/getting-started.md
@@ -0,0 +1,6 @@
+# ALCF Services
+
+Below is a list of some of the services ALCF makes available for use across our HPC clusters.
+
+- [JupyterHub](jupyter-hub.md): An interactive computing environment for different languages.
+- [Continuous Integration](continuous-integration.md): Automated processes to help perform build, test, package, and deploy activities.
diff --git a/mkdocs.yml b/mkdocs.yml
index 002380c08..9ba17bb3f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -25,7 +25,9 @@ nav:
   - Data Storage: data-management/filesystem-and-storage/data-storage.md
   - HPSS: data-management/filesystem-and-storage/hpss.md
   - Disk Quota: data-management/filesystem-and-storage/disk-quota.md
-  - Services: # services/index.md # Cant directly link to this in the nav sidebar, since it is a dropdown. Only linked to in base docs/index.md
+  - Services: # services/index.md # Can't directly link to this in the nav sidebar, since
+              # it is a dropdown.
Only linked to in base docs/index.md + - Getting Started: services/getting-started.md - JupyterHub: services/jupyter-hub.md - Continuous Integration: - General: services/continuous-integration.md @@ -189,7 +191,7 @@ nav: - Darshan: theta-gpu/performance-tools/darshan.md - NVIDIA Nsight: theta-gpu/performance-tools/nvidia-nsight.md - AI Testbed: - - Getting Started: ai-testbed/index.md + - Getting Started: ai-testbed/getting-started.md - Cerebras: - System Overview: ai-testbed/cerebras/system-overview.md - Getting Started: ai-testbed/cerebras/getting-started.md From b85b106a227540e3c2a94c0f9e37335df26755d5 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 27 Jul 2023 19:04:01 -0500 Subject: [PATCH 08/10] Add --strict to livesite deployment Bump GH Actions versions https://github.blog/changelog/2023-06-13-github-actions-all-actions-will-run-on-node16-instead-of-node12-by-default/ --- .github/workflows/update-livesite.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/update-livesite.yml b/.github/workflows/update-livesite.yml index 1088f8150..eeaabb013 100644 --- a/.github/workflows/update-livesite.yml +++ b/.github/workflows/update-livesite.yml @@ -8,11 +8,11 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: recursive - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.x - run: pip install -r requirements.txt - - run: mkdocs gh-deploy --force + - run: mkdocs gh-deploy --force --strict From a8436b2d2c8c56257dd1c2bc0860b388baf99327 Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 27 Jul 2023 23:34:30 -0500 Subject: [PATCH 09/10] Fix links --- .../allocation-management.md | 2 +- .../project-management/starting-alcf-award.md | 2 +- .../graphcore/unused/Scaling-ResNet50.md | 8 +- .../{ => unused}/files/benchmarks.yml | 0 .../graphcore/unused/profiling-mnist.md | 8 +- .../graphcore/unused/profiling-resnet50.md | 6 +- .../sambanova_gen1/unused/sambanova.md | 2 +- .../example-multi-node-programs.md | 4 +- .../sambanova_gen2/unused/sambanova.md | 110 ------------------ .../compiling-and-linking-overview.md | 2 +- docs/running-jobs/example-job-scripts.md | 2 +- docs/running-jobs/job-and-queue-scheduling.md | 2 +- docs/services/jenkins.md | 2 +- docs/stylesheets/alcf-extra.css | 9 ++ .../building-python-packages.md | 2 +- .../job-and-queue-scheduling.md | 2 +- docs/theta/data-science-workflows/keras.md | 28 ++--- docs/theta/performance-tools/intel-advisor.md | 2 +- docs/theta/programming-models/openmp-theta.md | 8 +- .../affinity-theta.md | 2 +- mkdocs.yml | 4 + 21 files changed, 54 insertions(+), 153 deletions(-) rename docs/ai-testbed/graphcore/{ => unused}/files/benchmarks.yml (100%) delete mode 100644 docs/ai-testbed/sambanova_gen2/unused/sambanova.md diff --git a/docs/account-project-management/allocation-management/allocation-management.md b/docs/account-project-management/allocation-management/allocation-management.md index d7450ef86..64f7c5252 100644 --- a/docs/account-project-management/allocation-management/allocation-management.md +++ b/docs/account-project-management/allocation-management/allocation-management.md @@ -2,7 +2,7 @@ Allocations require management – balance checks, resource allocation, requesting more time, etc. ## Checking for an Active Allocation -To determine if there is an active allocation, check [Job Submision](../../../theta/queueing-and-running-jobs/job-and-queue-scheduling/#submit-a-job). 
+To determine if there is an active allocation, check [Job Submission](../../theta/queueing-and-running-jobs/job-and-queue-scheduling.md#submit-a-job). For information on how to run the query, look at our documentation on our [sbank Allocations Accounting System](sbank-allocation-accounting-system.md) or email [support@alcf.anl.gov](mailto:support@alcf.anl.gov) and ask for all active allocations. diff --git a/docs/account-project-management/project-management/starting-alcf-award.md b/docs/account-project-management/project-management/starting-alcf-award.md index 0ef6b2543..59fc9f991 100644 --- a/docs/account-project-management/project-management/starting-alcf-award.md +++ b/docs/account-project-management/project-management/starting-alcf-award.md @@ -106,7 +106,7 @@ The ALCF will send you a report template at the end of each quarter. Please comp Please be aware that we will periodically monitor, and could potentially adjust, your project allocation if a large portion of it goes unused. You may view: [Pullback Policy](../../policies/queue-scheduling/pullback-policy.md) ### Allocation Overburn Policy -Please see this page for overburn/overuse eligibility for INCITE projects that have exhausted their allocation in the first 11 months of its allocation year: [Allocation Overburn](../../../policies/queue-scheduling/queue-and-scheduling-policy/#incitealcc-overburn-policy) +Please see this page for overburn/overuse eligibility for INCITE projects that have exhausted their allocation in the first 11 months of its allocation year: [Allocation Overburn](../../policies/queue-scheduling/queue-and-scheduling-policy.md#incitealcc-overburn-policy) ### Acknowledgment In Publications Please follow the guidelines provided on the [ALCF Acknowledgement Policy page](../../policies/alcf-acknowledgement-policy.md) to properly acknowledge the use of ALCF resources in all of your publications, both online and print. diff --git a/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md b/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md index f462c7081..39eae342d 100644 --- a/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md +++ b/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md @@ -1,6 +1,6 @@ # Scaling ResNet50 -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. ## Examples Repo @@ -131,12 +131,12 @@ You should see: # gc-poplar-04:22 SSH-2.0-OpenSSH_8.2p1 Ubuntu-4ubuntu0.5 ``` -## Benchmarks.yml +## `benchmarks.yml` Update **${HOME}/graphcore/examples/vision/cnns/pytorch/train/benchmarks.yml** -with your favorite editor to match [benchmarks.yml](/docs/graphcore/benchmarks.yml). +with your favorite editor to match [benchmarks.yml](./files/benchmarks.yml). -## Configs.yml +## `configs.yml` Update **${HOME}/graphcore/examples/vision/cnns/pytorch/train/configs.yml** with your favorite editor. 
At about line 30, change **use_bbox_info: true** to diff --git a/docs/ai-testbed/graphcore/files/benchmarks.yml b/docs/ai-testbed/graphcore/unused/files/benchmarks.yml similarity index 100% rename from docs/ai-testbed/graphcore/files/benchmarks.yml rename to docs/ai-testbed/graphcore/unused/files/benchmarks.yml diff --git a/docs/ai-testbed/graphcore/unused/profiling-mnist.md b/docs/ai-testbed/graphcore/unused/profiling-mnist.md index 26fdb4169..5496c9b2f 100644 --- a/docs/ai-testbed/graphcore/unused/profiling-mnist.md +++ b/docs/ai-testbed/graphcore/unused/profiling-mnist.md @@ -1,10 +1,10 @@ # Profiling MNIST -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. -Follow the instructions in [Virtual Environments](/docs/graphcore/Virtual-Environments) up to and including **PopART Environment Setup**. +Follow the instructions in [Virtual Environments](../virtual-environments.md) up to and including **PopART Environment Setup**. -Following the instructions in [Example Programs](/docs/graphcore/Example-Programs) up to and including +Following the instructions in [Example Programs](../example-programs.md) up to and including **MNIST, Install Requirements**. ## Change Directory @@ -33,4 +33,4 @@ Do so by running the following command: python mnist_poptorch.py ``` -When MNIST has finished running, see [Profiling](/docs/graphcore/Profiling) to use **Graph Analyser**. +When MNIST has finished running, see [Profiling](./profiling.md) to use **Graph Analyser**. diff --git a/docs/ai-testbed/graphcore/unused/profiling-resnet50.md b/docs/ai-testbed/graphcore/unused/profiling-resnet50.md index 201ae1a1c..1eca7f505 100644 --- a/docs/ai-testbed/graphcore/unused/profiling-resnet50.md +++ b/docs/ai-testbed/graphcore/unused/profiling-resnet50.md @@ -1,8 +1,8 @@ # Profiling ResNet50 -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. -Follow the instructions in [Virtual Environments](/docs/graphcore/Virtual-Environments) up to and including **PopART Environment Setup**. +Follow the instructions in [Virtual Environments](../virtual-environments.md) up to and including **PopART Environment Setup**. ## Examples Repo @@ -58,4 +58,4 @@ python3 -m examples_utils benchmark --spec benchmarks.yml --benchmark pytorch_re ## Profile Results -When ResNet50 has finished running, see [Profiling](/docs/graphcore/Profiling) to use **Graph Analyser**. +When ResNet50 has finished running, see [Profiling](./profiling.md) to use **Graph Analyser**. diff --git a/docs/ai-testbed/sambanova_gen1/unused/sambanova.md b/docs/ai-testbed/sambanova_gen1/unused/sambanova.md index 4114d6d7a..1d7257119 100644 --- a/docs/ai-testbed/sambanova_gen1/unused/sambanova.md +++ b/docs/ai-testbed/sambanova_gen1/unused/sambanova.md @@ -36,7 +36,7 @@ broken (503 errors). 
## Further Information -[Human Decisions Files notes](/display/AI/Human+Decisions+Files+notes) + ## Creating a SambaNova Portal Account to access the documentation portal diff --git a/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md b/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md index 1cd3a6c06..d1c8f82c2 100644 --- a/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md +++ b/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md @@ -1,8 +1,6 @@ # Example Multi-Node Programs -In this section we will learn how to extend the UNet2d and Gpt1.5B applications scripts that we introduced in the [Example Programs](/docs/ai-testbed/sambanova_gen2/example-programs.md) to compile and run multiple instances of the model in a data parallel fashion across multiple tiles or across multiple nodes. - - +In this section we will learn how to extend the UNet2d and Gpt1.5B applications scripts that we introduced in the [Example Programs](./example-programs.md) to compile and run multiple instances of the model in a data parallel fashion across multiple tiles or across multiple nodes. ## UNet2d diff --git a/docs/ai-testbed/sambanova_gen2/unused/sambanova.md b/docs/ai-testbed/sambanova_gen2/unused/sambanova.md deleted file mode 100644 index 4114d6d7a..000000000 --- a/docs/ai-testbed/sambanova_gen2/unused/sambanova.md +++ /dev/null @@ -1,110 +0,0 @@ -# SambaNova - -## PyTorch Mirrors - -See . - -There are two mirrors (in the python docs) used for downloading the -mnist dataset. - -mirrors = [ - 'http://yann.lecun.com/exdb/mnist/', - 'https://ossci-datasets.s3.amazonaws.com/mnist/'] - -[yann.lecun.com](http://yann.lecun.com) appears to be intermittently -broken (503 errors). - -## Resources - -- - -- [Argonne SambaNova Training - 11/20](https://anl.app.box.com/s/bqc101mvt3r7rpxbd2yxjsf623ea3gpe) - -- [https://docs.sambanova.ai](https://docs.sambanova.ai/) Create a - SambaNova account if you do not have one. - -- [Getting Started with - SambaFlow](https://docs.sambanova.ai/sambanova-docs/1.6/developer/getting-started.html) - Skip this one. - -- [Tutorial: Creating Models with - SambaFlow](https://docs.sambanova.ai/sambanova-docs/1.6/developer/intro-tutorial.html) - -- Administrators --- @ryade - -## Further Information - -[Human Decisions Files notes](/display/AI/Human+Decisions+Files+notes) - -## Creating a SambaNova Portal Account to access the documentation portal - -1. Go to [login.sambanova.ai](http://login.sambanova.ai/); - -2. Select the "Sign up" link at the bottom; - -3. Enter your information - - 1. Your ANL email address; - - 2. A password that you choose to access the site; - - 3. First name; - - 4. Last name; - - 5. Alternate email address; - - 6. Use 64693137 for the CLOUD ID; - - 7. Select "Register" button; - - 8. Note: The new web page may be displaying a QR code. Do not navigate away from it. Please edit this page to describe what -happenes for you. - -4. Verify your email address - - 1. Open your ANL email; - - 2. Open the email from Okta; - - 3. Select the "Activate Account" button; - - 4. Select the "Configure factor" button on the displayed web page; - - 5. Select either iPhone or Android for the device time on the new web page; - - 6. Install Okta Verify from the App Store/Google Play Store onto your mobile device.; - - 7. Select "Next" button on the web page; - -5. On your phone - - 1. Open Okta Verify app; - - 2. Select "Get Started" button; - - 3. Select "Next" button; - - 4. Select "Add Account" button; - - 5. 
Select "Organization" for Account Type; - - 6. Scan the QR Code shown in the browser; - -6. Sign in to the SambaNova web site - - 1. Select the "SambaNova Documentation" button. - -Authorization for sections of the SambaNova site uses the tuple (email -address, cloud id). For ANL users, th*ese **should** be an anl email -address and the cloud id specified above (64693137). (Note: the cloud -id can be changed in the SambaNova user settings.) -**If you are not at Argonne, please send us an email (ai@alcf.anl.gov) -for access. ** - -If you plan to publish, say to a conference, workshop or journal, we -have a review process wherein you share the draft with us -(pre-submission) at and we -will work with SambaNova for the requisite approvals. diff --git a/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md b/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md index 4e607cd2f..e756aa315 100644 --- a/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md +++ b/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md @@ -2,7 +2,7 @@ ## Compiling on Polaris Login and Compute Nodes -If your build system does not require GPUs for the build process, as is usually the case, compilation of GPU-accelerated codes is generally expected to work well on the Polaris login nodes. If your build system _does_ require GPUs, you cannot yet compile on the Polaris login nodes, as they do not currently have GPUs installed. You may in this case compile your applications on the Polaris compute nodes. Do this by submitting an [interactive single-node job](/polaris/running-jobs#Interactive-Jobs-on-Compute-Nodes), or running your build system in a batch job. +If your build system does not require GPUs for the build process, as is usually the case, compilation of GPU-accelerated codes is generally expected to work well on the Polaris login nodes. If your build system _does_ require GPUs, you cannot yet compile on the Polaris login nodes, as they do not currently have GPUs installed. You may in this case compile your applications on the Polaris compute nodes. Do this by submitting an [interactive single-node job](../running-jobs.md#Interactive-Jobs-on-Compute-Nodes), or running your build system in a batch job. diff --git a/docs/running-jobs/example-job-scripts.md b/docs/running-jobs/example-job-scripts.md index 29c8111a0..e48dd4cda 100644 --- a/docs/running-jobs/example-job-scripts.md +++ b/docs/running-jobs/example-job-scripts.md @@ -7,7 +7,7 @@ A simple example using a similar script on Polaris is available in the ## CPU MPI-OpenMP Examples -The following `submit.sh` example submits a 1-node job to Polaris with 16 MPI ranks per node and 2 OpenMP threads per rank. See [Queues](./job-and-queue-scheduling/#queues) for details on practical limits to node counts and job times for different sizes of jobs. +The following `submit.sh` example submits a 1-node job to Polaris with 16 MPI ranks per node and 2 OpenMP threads per rank. See [Queues](./job-and-queue-scheduling.md#queues) for details on practical limits to node counts and job times for different sizes of jobs. The [`hello_affinity`](https://github.com/argonne-lcf/GettingStarted/tree/master/Examples/Polaris/affinity_gpu) program is a compiled C++ code, which is built via `make -f Makefile.nvhpc` in the linked directory after cloning the [Getting Started](https://github.com/argonne-lcf/GettingStarted) repository. 
diff --git a/docs/running-jobs/job-and-queue-scheduling.md b/docs/running-jobs/job-and-queue-scheduling.md
index 65545595d..9c9902203 100644
--- a/docs/running-jobs/job-and-queue-scheduling.md
+++ b/docs/running-jobs/job-and-queue-scheduling.md
@@ -89,7 +89,7 @@ Where:
 * `walltime=HH:MM:SS` specifying a wall time is mandatory at the ALCF. Valid wall times depend on the queue you are using. There is a table with the queues for each machine at the end of this section and in the machine specific documentation.
 * `filesystems=fs1:fs2:...` Specifying which filesystems your application uses is mandatory at ALCF. The reason for this is if a filesystem goes down, we have a way of making PBS aware of that and it won't run jobs that need that filesystem. If you don't specify filesystems you will receive the following error: `qsub: Resource: filesystems is required to be set.`
 * `place=scatter` is telling PBS you want each of your chunks on a separate vnode. By default, PBS will pack your chunks to get maximum utilization. If you requested `ncpus=1` and `chunks=64` **without** `place=scatter` on a system with `ncpus=64`, all your chunks would end up on one node.
-* Your job script: See [Example Job Scripts](../example-job-scripts) for more information about how to build your job script. For options that wont change, you do have the option of taking things off the command line and putting them in your job script. For instance the above command line could be simplified to `qsub -l select=<#> ` if you added the following to the top (the PBS directives have to be before any executable line) of your job script:
+* Your job script: See [Example Job Scripts](./example-job-scripts.md) for more information about how to build your job script. For options that won't change, you do have the option of taking things off the command line and putting them in your job script. For instance, the above command line could be simplified to `qsub -l select=<#> ` if you added the following to the top (the PBS directives have to be before any executable line) of your job script:

 ```bash
 #PBS -A 
diff --git a/docs/services/jenkins.md b/docs/services/jenkins.md
index 41f2343bb..5e98e0f16 100644
--- a/docs/services/jenkins.md
+++ b/docs/services/jenkins.md
@@ -1,7 +1,7 @@
 # Jenkins on Theta

 ## Jenkins to be decommissioned
-New projects should request access to use our GitLab-CI-based service. You can learn how to request access in our documentation found [here](/services/gitlab-ci/#quickstart).
+New projects should request access to use our GitLab-CI-based service. You can learn how to request access in our documentation found [here](./gitlab-ci.md#quickstart).

 Existing projects can continue to use Jenkins. We will notify projects when we have the date it will be retired. Projects will have ample notice to migrate their work to our GitLab-CI service.
diff --git a/docs/stylesheets/alcf-extra.css b/docs/stylesheets/alcf-extra.css index 589064f0e..6c4b23aed 100644 --- a/docs/stylesheets/alcf-extra.css +++ b/docs/stylesheets/alcf-extra.css @@ -913,3 +913,12 @@ footer a:hover { .js-dropdown-hidden { display: none; } + +table { + table-layout: fixed; + max-width: 100%; +} + +.md-typeset code { + overflow-wrap: break-word; +} diff --git a/docs/theta-gpu/data-science-workflows/building-python-packages.md b/docs/theta-gpu/data-science-workflows/building-python-packages.md index 6fb47e406..df42771c2 100644 --- a/docs/theta-gpu/data-science-workflows/building-python-packages.md +++ b/docs/theta-gpu/data-science-workflows/building-python-packages.md @@ -4,7 +4,7 @@ To build Python packages for ThetaGPU, there are two options: build on top of a ## Build on ThetaGPU compute using Conda To build on ThetaGPU compute and install your own packages, login to theta and then submit an interactive job to log on to ThetaGPU compute node. -Please see [Running PyTorch with Conda](/dl-frameworks/running-pytorch-conda.md) or [Running TensorFlow with Conda](/dl-frameworks/running-tensorflow-conda/index.html) for more information. +Please see [Running PyTorch with Conda](./dl-frameworks/running-pytorch-conda.md) or [Running TensorFlow with Conda](./dl-frameworks/running-tensorflow-conda.md) for more information. ## Building on top of a container At the moment, you will need two shells to do this: have one open on a login node (for example, ```thetaloginN```, and one open on a compute node (```thetagpuN```). First, start the container in interactive mode: diff --git a/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md b/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md index 1471a22ce..bc6d76b5d 100644 --- a/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md +++ b/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md @@ -13,7 +13,7 @@ As with all Argonne Leadership Computing Facility production systems, job priori * job duration - shorter duration jobs will accumulate priority more quickly, so it is best to specify the job run time as accurately as possible ### Reservations and Scheduling Policy -Some work will require use of Theta that requires deviation from regular policy. On such occasions, normal reservation policy applies. Please send the [regular form](/docs/theta/queueing-and-running-jobs/machine-reservations.md) no fewer than five (5) business days in advance. +Some work will require use of Theta that requires deviation from regular policy. On such occasions, normal reservation policy applies. Please send the [regular form](../../theta/queueing-and-running-jobs/machine-reservations.md) no fewer than five (5) business days in advance. ### Monday Maintenance When the ALCF is on a regular business schedule, preventitive maintenance is typically scheduled on alternate Mondays. The showres command may be used to view pending and active maintenance reservations. diff --git a/docs/theta/data-science-workflows/keras.md b/docs/theta/data-science-workflows/keras.md index ba0806350..f9ba8cee4 100644 --- a/docs/theta/data-science-workflows/keras.md +++ b/docs/theta/data-science-workflows/keras.md @@ -7,26 +7,26 @@ On Theta, we support TensorFlow backend for Keras. 
To use the datascience Keras module on Theta, please load the following two modules:
```
module load datascience/keras-2.2.4
- 
+
module load datascience/tensorflow-1.12
```
-Notice that the datascience/tensorflow-* modules were compiled with AVX512 extension on Theta. Therefore, it could not run on login node, otherwise it will issue an “illegal instruction” fault. One has to submit the job to KNL nodes (see TensorFlow documentation for details).
+Notice that the `datascience/tensorflow-*` modules were compiled with the AVX512 extension on Theta. Therefore, they cannot run on a login node; otherwise, they will issue an "illegal instruction" fault. One has to submit the job to the KNL nodes (see the TensorFlow documentation for details).

-Since we use TensorFlow as the backend, all the optimal environmental setups (Threading + affinity) are applicable here. Please visit the [Tensorflow documentation page](tensorflow) for the optimal setting.
+Since we use TensorFlow as the backend, all the optimal environment setups (threading + affinity) are applicable here. Please visit the [TensorFlow documentation page](tensorflow.md) for the optimal settings.

-We do not see any incompatibility issues in using different versions of keras and tensorflow as those specified above. Feel free to change other versions of keras or TensorFlow. Currently, we support version 2.2.2 and 2.2.4.
+We have not seen incompatibility issues when using versions of Keras and TensorFlow other than those specified above, so feel free to load other versions. Currently, we support versions 2.2.2 and 2.2.4.

## Distributed learning using Horovod

We support distributed learning using Horovod. To use it please load datascience/horovod-0.15.2 module. Please change your python script accordingly

-### Initialize Horovod by adding the following lines to the beginning of your python script.
+### Initialize Horovod by adding the following lines to the beginning of your Python script.
```
import horovod.keras as hvd
- 
+
hvd.init()
```
-After this initialization, the total number of ranks and the rank id could be access through hvd.rank(), hvd.size() functions. 
+After this initialization, the total number of ranks and the rank id can be accessed through the hvd.size() and hvd.rank() functions.

## Scale the learning rate.
Typically, since we use multiple workers, the global batch is usually increased n times (n is the number of workers). The learning rate should increase proportionally as follows (assuming that the learning rate initially is 0.01).

```
opt = keras.optimizers.Adadelta(1.0 * hvd.size()
```
-In some case, 0.01*hvd.size() might be too large, one might want to have some warming up steps with smaller learning rate.
+In some cases, `0.01*hvd.size()` might be too large, and one might want some warm-up steps with a smaller learning rate.

### Wrap the optimizer with Distributed Optimizer
```
opt = hvd.DistributedOptimizer(opt)
```
-In such case, opt will automatically average the loss and gradients among all the workers and then perform update.
+In this case, opt will automatically average the loss and gradients among all the workers and then perform the update.

### Broadcast the model from rank 0, so that all the workers will have the same starting point
```
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
```
-Notice that by default, TensorFlow will initialize the parameters randomly. Therefore, by default, different workers will have different parameters. So it is crucial to broadcast the model from rank 0 to other ranks.
+Notice that by default, TensorFlow will initialize the parameters randomly. Therefore, by default, different workers will have different parameters, so it is crucial to broadcast the model from rank 0 to the other ranks.

### Letting only rank 0 to write checkpoint
```
if hvd.rank() == 0:
  callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

### Loading data according to rank ID
-Since we are using data parallel scheme. Different ranks shall process different data. One has to change the data loader part of the python script to ensure different ranks read different mini batches of data.
+Since we are using a data-parallel scheme, different ranks process different data. One has to change the data loader part of the Python script to ensure that different ranks read different mini-batches of data.
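+
+Putting the pieces together at run time: once the script is modified as above, a data-parallel job simply launches one Python process per Horovod worker. A hedged sketch for the linreg example referenced below; the node, rank, and thread counts here are illustrative only:
+
+```
+# 2 KNL nodes, 4 MPI ranks (Horovod workers) per node, 32 OpenMP threads per rank
+aprun -n 8 -N 4 -d 32 -j 2 -cc depth -e OMP_NUM_THREADS=32 python linreg_keras.py
+```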

#### Example
A simple example for doing linear regression using Keras + Horovod is put in the follwoing directory on Theta:
-/projects/SDL_Workshop/hzheng/examples/keras/linreg
-
-linreg_keras.py is the python script, and qsub.sc is the COBALT submission script.
+/projects/SDL_Workshop/hzheng/examples/keras/linreg
+
+linreg_keras.py is the Python script, and qsub.sc is the COBALT submission script.
diff --git a/docs/theta/performance-tools/intel-advisor.md b/docs/theta/performance-tools/intel-advisor.md
index 0a9a5aca9..ba7808674 100644
--- a/docs/theta/performance-tools/intel-advisor.md
+++ b/docs/theta/performance-tools/intel-advisor.md
@@ -98,6 +98,6 @@ There are three other types of collections that can be performed with Advisor fo
 ## Additional Information:
 There are many command line options. See [2] for more details on all of the options, and its more comprehensive user guide also available on Intel’s website.
 - [1] Williams, Samuel, Andrew Waterman, and David Patterson. "Roofline: an insightful visual performance model for multicore architectures." Communications of the ACM 52.4 (2009): 65-76.
-- [2] Intel. “Get Started with Intel Advisor.” Intel® Software, Intel, 18 Oct. 2018, [software.intel.com/en-us/get-started-with-advisor-for-more-information](software.intel.com/en-us/get-started-with-advisor-for-more-information)
+- [2] Intel. “Get Started with Intel Advisor.” Intel® Software, Intel, 18 Oct. 2018
diff --git a/docs/theta/programming-models/openmp-theta.md b/docs/theta/programming-models/openmp-theta.md
index bc16757f3..25dcd29a3 100644
--- a/docs/theta/programming-models/openmp-theta.md
+++ b/docs/theta/programming-models/openmp-theta.md
@@ -21,7 +21,7 @@ To enable OpenMP, use the following flags in your compile line, depending on the
 | LLVM | PrgEnv-llvm | “-fopenmp" |

 ## Running Jobs with OpenMP on Theta
-To run jobs on Theta with OpenMP threads, the OpenMP environment variable OMP_NUM_THREADS will need to be set to the desired number of threads per MPI rank, and certain flags in the aprun command will need to be set. Some examples are given below, and more information about running is here: [Affinity on Theta](/docs/theta/queueing-and-running-jobs/affinity-theta.md).
+To run jobs on Theta with OpenMP threads, the OpenMP environment variable `OMP_NUM_THREADS` will need to be set.
 to the desired number of threads per MPI rank, and certain flags in the aprun command will need to be set. Some examples are given below, and more information about running is here: [Affinity on Theta](../queueing-and-running-jobs/affinity-theta.md).

 ### Source code for xthi.c:
 ```
@@ -94,11 +94,11 @@ int main(int argc, char *argv[])
 $ cc -qopenmp xthi.c -o xthi # PrgEnv-intel
 ```

-2. Run with aprun (either in a batch script that is submitted to the job scheduler a or on the command line as part of an interactive session. See [job scheduling](/docs/theta/queueing-and-running-jobs/job-and-queue-scheduling.md) for more details about how to run.
+2. Run with `aprun` (either in a batch script that is submitted to the job scheduler, or on the command line as part of an interactive session). See [job scheduling](../queueing-and-running-jobs/job-and-queue-scheduling.md) for more details about how to run.

 Mapping of OpenMP threads to hardware threads on a KNL node can be achieved with the “--cc” option in aprun.

-One common option described in more detail on [Affinity on Theta](/docs/theta/queueing-and-running-jobs/affinity-theta.md) is to use --cc depth with the -d and -j flags:
+One common option described in more detail on [Affinity on Theta](../queueing-and-running-jobs/affinity-theta.md) is to use `--cc depth` with the `-d` and `-j` flags:

 ```
 $ aprun -n 1 -N 1 -d 8 -j 1 -cc depth -e OMP_NUM_THREADS=8 ./a.out

 Hello from rank 0, thread 6, on nid03554. (core affinity = 6)

 Application 19165961 resources: utime ~1s, stime ~1s, Rss ~6284, inblocks ~0, outblocks ~8
 ```

-Another option is to use --cc none with OpenMP affinity controls:
+Another option is to use `--cc none` with OpenMP affinity controls:

 ```
 $ aprun -n 1 -N 1 -cc none -e OMP_NUM_THREADS=8 -e OMP_PROC_BIND=spread -e OMP_PLACES=cores ./a.out
diff --git a/docs/theta/queueing-and-running-jobs/affinity-theta.md b/docs/theta/queueing-and-running-jobs/affinity-theta.md
index a7566c782..d6f58ae7e 100644
--- a/docs/theta/queueing-and-running-jobs/affinity-theta.md
+++ b/docs/theta/queueing-and-running-jobs/affinity-theta.md
@@ -12,7 +12,7 @@ The numbers inside the quadrants identify the specific hardware threads in the c
Visual representation of two tiles in a KNL
-Using the -j, -d, and --cc arguments to aprun and environment variables, MPI ranks and threads can be assigned to run on specific hardware threads. For more information about the flags to aprun, see [Running Jobs on Theta](/docs/theta/queueing-and-running-jobs/job-and-queue-scheduling.md). Four examples of using aprun are given below followed by descriptions of two methods for displaying the mapping produced.
+Using the -j, -d, and --cc arguments to aprun and environment variables, MPI ranks and threads can be assigned to run on specific hardware threads. For more information about the flags to aprun, see [Running Jobs on Theta](../queueing-and-running-jobs/job-and-queue-scheduling.md). Four examples of using aprun are given below followed by descriptions of two methods for displaying the mapping produced.

 **Note:** Logical core and hardware thread are used interchangeably below.

diff --git a/mkdocs.yml b/mkdocs.yml
index 9ba17bb3f..90a6f2ed0 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -344,6 +344,10 @@ markdown_extensions:
   - pymdownx.tasklist:
       custom_checkbox: true

+validation:
+  omitted_files: warn
+  absolute_links: warn
+  unrecognized_links: warn

 repo_name: 'argonne-lcf/alcf-userguide'
 repo_url: 'https://github.com/argonne-lcf/alcf-userguide'

From 39bb8052f335be65bff906bfce54ad1fb5e7f9f3 Mon Sep 17 00:00:00 2001
From: Kyle Gerard Felker
Date: Fri, 28 Jul 2023 16:51:57 -0500
Subject: [PATCH 10/10] Add new GH Action workflow for testing PRs and commits
 to master

---
 .github/workflows/test-mkdocs-build.yml | 20 ++++++++++++++++++++
 Makefile                                |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/test-mkdocs-build.yml

diff --git a/.github/workflows/test-mkdocs-build.yml b/.github/workflows/test-mkdocs-build.yml
new file mode 100644
index 000000000..2c93a3895
--- /dev/null
+++ b/.github/workflows/test-mkdocs-build.yml
@@ -0,0 +1,20 @@
+name: test-mkdocs-build
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.x
+      - run: pip install -r requirements.txt
+      - run: mkdocs build --strict
diff --git a/Makefile b/Makefile
index 70dfb25ca..d63e4c35b 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ install-dev:

 .PHONY: build-docs
 build-docs:
-	mkdocs build
+	mkdocs build --strict

 .PHONY: all