diff --git a/.github/workflows/test-mkdocs-build.yml b/.github/workflows/test-mkdocs-build.yml new file mode 100644 index 000000000..2c93a3895 --- /dev/null +++ b/.github/workflows/test-mkdocs-build.yml @@ -0,0 +1,20 @@ +name: test-mkdocs-build +on: + push: + branches: + - main + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + - uses: actions/setup-python@v4 + with: + python-version: 3.x + - run: pip install -r requirements.txt + - run: mkdocs build --strict diff --git a/.github/workflows/update-livesite.yml b/.github/workflows/update-livesite.yml index 1088f8150..eeaabb013 100644 --- a/.github/workflows/update-livesite.yml +++ b/.github/workflows/update-livesite.yml @@ -8,11 +8,11 @@ jobs: deploy: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: submodules: recursive - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.x - run: pip install -r requirements.txt - - run: mkdocs gh-deploy --force + - run: mkdocs gh-deploy --force --strict diff --git a/Makefile b/Makefile index 70dfb25ca..d63e4c35b 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ install-dev: .PHONY: build-docs build-docs: - mkdocs build + mkdocs build --strict .PHONY: all diff --git a/docs/account-project-management/allocation-management/allocation-management.md b/docs/account-project-management/allocation-management/allocation-management.md index d41d69d61..64f7c5252 100644 --- a/docs/account-project-management/allocation-management/allocation-management.md +++ b/docs/account-project-management/allocation-management/allocation-management.md @@ -2,14 +2,14 @@ Allocations require management – balance checks, resource allocation, requesting more time, etc. ## Checking for an Active Allocation -To determine if there is an active allocation, check [Job Submision](../../../theta/queueing-and-running-jobs/job-and-queue-scheduling/#submit-a-job). +To determine if there is an active allocation, check [Job Submission](../../theta/queueing-and-running-jobs/job-and-queue-scheduling.md#submit-a-job). For information on how to run the query, look at our documentation on the [sbank Allocations Accounting System](sbank-allocation-accounting-system.md) or email [support@alcf.anl.gov](mailto:support@alcf.anl.gov) and ask for all active allocations. ## Using sbank to Determine the Balance of an Allocation To determine which platforms have an active balance, check our allocation accounting system [sbank](sbank-allocation-accounting-system.md). -- To obtain the allocation balance, check the sbank command [sbank-list-allocations](sbank-list-allocations.md). +- To obtain the allocation balance, check the sbank command [sbank-list-allocations](not_in_nav/sbank-list-allocations.md). - DD projects with a negative balance will not be able to run jobs until they have requested additional time; see Getting more time below. - INCITE and ALCC PIs are automatically emailed a summary of project usage. If this is a DD project, please email [support@alcf.anl.gov](mailto:support@alcf.anl.gov). 
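As a concrete illustration of the balance check described above, here is a minimal sketch of the `sbank-list-allocations` query this page links to; the project name is a hypothetical placeholder, and the `-p`/`-r` flags select the project and resource:

```bash
# Show the remaining balance for one project on one resource.
# "MyProject" is a placeholder; substitute your own project name.
sbank-list-allocations -p MyProject -r theta
```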
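Looking back at the CI changes at the top of this diff, contributors can reproduce the same strict check locally before pushing. A minimal sketch, assuming a clone of the docs repository with `requirements.txt` at its root, exactly as the workflow steps assume:

```bash
# Mirror the test-mkdocs-build workflow steps locally.
pip install -r requirements.txt
mkdocs build --strict   # --strict promotes warnings (e.g., broken links) to build failures

# Or use the repository's Makefile target, which now passes --strict as well:
make build-docs
```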
diff --git a/docs/account-project-management/allocation-management/sbank-detail-allocations.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-allocations.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-allocations.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-allocations.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-jobs.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-jobs.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-jobs.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-jobs.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-projects.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-projects.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-projects.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-projects.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-transactions.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-transactions.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-transactions.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-transactions.md diff --git a/docs/account-project-management/allocation-management/sbank-detail-users.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail-users.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail-users.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail-users.md diff --git a/docs/account-project-management/allocation-management/sbank-detail.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-detail.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-detail.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-detail.md diff --git a/docs/account-project-management/allocation-management/sbank-examples.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-examples.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-examples.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-examples.md diff --git a/docs/account-project-management/allocation-management/sbank-list-allocations.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-allocations.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-allocations.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-allocations.md diff --git a/docs/account-project-management/allocation-management/sbank-list-jobs.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-jobs.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-jobs.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-jobs.md diff --git 
a/docs/account-project-management/allocation-management/sbank-list-projects.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-projects.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-projects.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-projects.md diff --git a/docs/account-project-management/allocation-management/sbank-list-transactions.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-transactions.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-transactions.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-transactions.md diff --git a/docs/account-project-management/allocation-management/sbank-list-users.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list-users.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list-users.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list-users.md diff --git a/docs/account-project-management/allocation-management/sbank-list.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-list.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-list.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-list.md diff --git a/docs/account-project-management/allocation-management/sbank-manpage.md b/docs/account-project-management/allocation-management/not_in_nav/sbank-manpage.md similarity index 100% rename from docs/account-project-management/allocation-management/sbank-manpage.md rename to docs/account-project-management/allocation-management/not_in_nav/sbank-manpage.md diff --git a/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md b/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md index fa2d1657c..b4d1a8475 100644 --- a/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md +++ b/docs/account-project-management/allocation-management/sbank-allocation-accounting-system.md @@ -5,22 +5,22 @@ The sbank accounting system helps users manage their allocations and usage per j ## Getting Started with sbank -[sbank Example Commands](sbank-examples.md) provides a set of example commands on how to use the most common commands. +[sbank Example Commands](not_in_nav/sbank-examples.md) provides a set of example commands on how to use the most common commands. ## sbank Man Pages Use these sbank man pages to get information on how to use the commands. 
-- [sbank](sbank-manpage.md) -- [sbank-detail](sbank-detail.md) -- [sbank-detail-allocations](sbank-detail-allocations.md) -- [sbank-detail-jobs](sbank-detail-jobs.md) -- [sbank-detail-projects](sbank-detail-projects.md) -- [sbank-detail-transactions](sbank-detail-transactions.md) -- [sbank-detail-users](sbank-detail-users.md) -- [sbank-list](sbank-list.md) -- [sbank-list-allocations](sbank-list-allocations.md) -- [sbank-list-jobs](sbank-list-jobs.md) -- [sbank-list-projects](sbank-list-projects.md) -- [sbank-list-transactions](sbank-list-transactions.md) -- [sbank-list-users](sbank-list-users.md) +- [sbank](not_in_nav/sbank-manpage.md) +- [sbank-detail](not_in_nav/sbank-detail.md) +- [sbank-detail-allocations](not_in_nav/sbank-detail-allocations.md) +- [sbank-detail-jobs](not_in_nav/sbank-detail-jobs.md) +- [sbank-detail-projects](not_in_nav/sbank-detail-projects.md) +- [sbank-detail-transactions](not_in_nav/sbank-detail-transactions.md) +- [sbank-detail-users](not_in_nav/sbank-detail-users.md) +- [sbank-list](not_in_nav/sbank-list.md) +- [sbank-list-allocations](not_in_nav/sbank-list-allocations.md) +- [sbank-list-jobs](not_in_nav/sbank-list-jobs.md) +- [sbank-list-projects](not_in_nav/sbank-list-projects.md) +- [sbank-list-transactions](not_in_nav/sbank-list-transactions.md) +- [sbank-list-users](not_in_nav/sbank-list-users.md) diff --git a/docs/account-project-management/project-management/starting-alcf-award.md b/docs/account-project-management/project-management/starting-alcf-award.md index 0ef6b2543..59fc9f991 100644 --- a/docs/account-project-management/project-management/starting-alcf-award.md +++ b/docs/account-project-management/project-management/starting-alcf-award.md @@ -106,7 +106,7 @@ The ALCF will send you a report template at the end of each quarter. Please comp Please be aware that we will periodically monitor, and could potentially adjust, your project allocation if a large portion of it goes unused. You may view: [Pullback Policy](../../policies/queue-scheduling/pullback-policy.md) ### Allocation Overburn Policy -Please see this page for overburn/overuse eligibility for INCITE projects that have exhausted their allocation in the first 11 months of its allocation year: [Allocation Overburn](../../../policies/queue-scheduling/queue-and-scheduling-policy/#incitealcc-overburn-policy) +Please see this page for overburn/overuse eligibility for INCITE projects that have exhausted their allocation in the first 11 months of its allocation year: [Allocation Overburn](../../policies/queue-scheduling/queue-and-scheduling-policy.md#incitealcc-overburn-policy) ### Acknowledgment In Publications Please follow the guidelines provided on the [ALCF Acknowledgement Policy page](../../policies/alcf-acknowledgement-policy.md) to properly acknowledge the use of ALCF resources in all of your publications, both online and print. 
diff --git a/docs/ai-testbed/cerebras/README.md b/docs/ai-testbed/cerebras/README.md deleted file mode 100644 index 121a8d56c..000000000 --- a/docs/ai-testbed/cerebras/README.md +++ /dev/null @@ -1,15 +0,0 @@ -# Cerebras - -[System Overview](./system-overview.md) - -[Getting Started](./getting-started.md) - -[Customizing Environment](./customizing-environment.md) - -[Running a Model/Program](./running-a-model-or-program.md) - -[Job Queuing and Submission](./job-queuing-and-submission.md) - -[Example Programs](./example-programs.md) - -[Miscellaneous](./miscellaneous.md) diff --git a/docs/ai-testbed/cerebras/performance-tools.md b/docs/ai-testbed/cerebras/performance-tools.md deleted file mode 100644 index e30aec231..000000000 --- a/docs/ai-testbed/cerebras/performance-tools.md +++ /dev/null @@ -1,4 +0,0 @@ -# Performance Tools - -Work in progress. - diff --git a/docs/ai-testbed/index.md b/docs/ai-testbed/getting-started.md similarity index 96% rename from docs/ai-testbed/index.md rename to docs/ai-testbed/getting-started.md index 5369c1669..2b6874800 100644 --- a/docs/ai-testbed/index.md +++ b/docs/ai-testbed/getting-started.md @@ -34,4 +34,4 @@ Submit your proposal requests at: [Allocation Request Page](https://accounts.alc The documentation is based on [MkDocs](https://www.mkdocs.org/){:target="_blank"} and source files are on [GitHub](https://github.com/argonne-lcf/ai-testbed-userdocs). You can contribute to the documentation by creating a pull request. - [Learn more on how to contribute to documentation.](howto-contribute.md) + [Learn more on how to contribute to documentation.](https://github.com/argonne-lcf/user-guides/blob/main/README.md) diff --git a/docs/ai-testbed/graphcore/README.md b/docs/ai-testbed/graphcore/README.md deleted file mode 100644 index 88009a343..000000000 --- a/docs/ai-testbed/graphcore/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# Graphcore -[System Overview](./system-overview.md)
-[Getting Started](./getting-started.md)
-[Virtual Environment](./virtual-environments.md)
-[Running a Model/Program](./running-a-model-or-program.md)
-[Job Queuing and Submission](./job-queuing-and-submission.md)
-[Example Programs](./example-programs.md)
-[Documentation](./documentation.md)
-[Miscellaneous](./miscellaneous.md)
diff --git a/docs/ai-testbed/graphcore/Scaling-ResNet50.md b/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md similarity index 97% rename from docs/ai-testbed/graphcore/Scaling-ResNet50.md rename to docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md index f462c7081..39eae342d 100644 --- a/docs/ai-testbed/graphcore/Scaling-ResNet50.md +++ b/docs/ai-testbed/graphcore/unused/Scaling-ResNet50.md @@ -1,6 +1,6 @@ # Scaling ResNet50 -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. ## Examples Repo @@ -131,12 +131,12 @@ You should see: # gc-poplar-04:22 SSH-2.0-OpenSSH_8.2p1 Ubuntu-4ubuntu0.5 ``` -## Benchmarks.yml +## `benchmarks.yml` Update **${HOME}/graphcore/examples/vision/cnns/pytorch/train/benchmarks.yml** -with your favorite editor to match [benchmarks.yml](/docs/graphcore/benchmarks.yml). +with your favorite editor to match [benchmarks.yml](./files/benchmarks.yml). -## Configs.yml +## `configs.yml` Update **${HOME}/graphcore/examples/vision/cnns/pytorch/train/configs.yml** with your favorite editor. At about line 30, change **use_bbox_info: true** to diff --git a/docs/ai-testbed/graphcore/cosmictagger-conversion.md b/docs/ai-testbed/graphcore/unused/cosmictagger-conversion.md similarity index 100% rename from docs/ai-testbed/graphcore/cosmictagger-conversion.md rename to docs/ai-testbed/graphcore/unused/cosmictagger-conversion.md diff --git a/docs/ai-testbed/graphcore/cosmictagger-ddp.md b/docs/ai-testbed/graphcore/unused/cosmictagger-ddp.md similarity index 100% rename from docs/ai-testbed/graphcore/cosmictagger-ddp.md rename to docs/ai-testbed/graphcore/unused/cosmictagger-ddp.md diff --git a/docs/ai-testbed/graphcore/files/Graph_Ananlyser_main.jpg b/docs/ai-testbed/graphcore/unused/files/Graph_Ananlyser_main.jpg similarity index 100% rename from docs/ai-testbed/graphcore/files/Graph_Ananlyser_main.jpg rename to docs/ai-testbed/graphcore/unused/files/Graph_Ananlyser_main.jpg diff --git a/docs/ai-testbed/graphcore/benchmarks.yml b/docs/ai-testbed/graphcore/unused/files/benchmarks.yml similarity index 100% rename from docs/ai-testbed/graphcore/benchmarks.yml rename to docs/ai-testbed/graphcore/unused/files/benchmarks.yml diff --git a/docs/ai-testbed/graphcore/files/image.png b/docs/ai-testbed/graphcore/unused/files/graphcore-sys-view.png similarity index 100% rename from docs/ai-testbed/graphcore/files/image.png rename to docs/ai-testbed/graphcore/unused/files/graphcore-sys-view.png diff --git a/docs/ai-testbed/graphcore/multi-node-setup.md b/docs/ai-testbed/graphcore/unused/multi-node-setup.md similarity index 100% rename from docs/ai-testbed/graphcore/multi-node-setup.md rename to docs/ai-testbed/graphcore/unused/multi-node-setup.md diff --git a/docs/ai-testbed/graphcore/profiling-mnist.md b/docs/ai-testbed/graphcore/unused/profiling-mnist.md similarity index 55% rename from docs/ai-testbed/graphcore/profiling-mnist.md rename to docs/ai-testbed/graphcore/unused/profiling-mnist.md index 26fdb4169..5496c9b2f 100644 --- a/docs/ai-testbed/graphcore/profiling-mnist.md +++ b/docs/ai-testbed/graphcore/unused/profiling-mnist.md @@ -1,10 +1,10 @@ # Profiling MNIST -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. 
-Follow the instructions in [Virtual Environments](/docs/graphcore/Virtual-Environments) up to and including **PopART Environment Setup**. +Follow the instructions in [Virtual Environments](../virtual-environments.md) up to and including **PopART Environment Setup**. -Following the instructions in [Example Programs](/docs/graphcore/Example-Programs) up to and including +Follow the instructions in [Example Programs](../example-programs.md) up to and including **MNIST, Install Requirements**. ## Change Directory @@ -33,4 +33,4 @@ Do so by running the following command: python mnist_poptorch.py ``` -When MNIST has finished running, see [Profiling](/docs/graphcore/Profiling) to use **Graph Analyser**. +When MNIST has finished running, see [Profiling](./profiling.md) to use **Graph Analyser**. diff --git a/docs/ai-testbed/graphcore/profiling-resnet50.md b/docs/ai-testbed/graphcore/unused/profiling-resnet50.md similarity index 80% rename from docs/ai-testbed/graphcore/profiling-resnet50.md rename to docs/ai-testbed/graphcore/unused/profiling-resnet50.md index 201ae1a1c..1eca7f505 100644 --- a/docs/ai-testbed/graphcore/profiling-resnet50.md +++ b/docs/ai-testbed/graphcore/unused/profiling-resnet50.md @@ -1,8 +1,8 @@ # Profiling ResNet50 -Follow all the instructions in [Getting Started](/docs/graphcore/Getting-Started) to log into a Graphcore node. +Follow all the instructions in [Getting Started](../getting-started.md) to log into a Graphcore node. -Follow the instructions in [Virtual Environments](/docs/graphcore/Virtual-Environments) up to and including **PopART Environment Setup**. +Follow the instructions in [Virtual Environments](../virtual-environments.md) up to and including **PopART Environment Setup**. ## Examples Repo @@ -58,4 +58,4 @@ python3 -m examples_utils benchmark --spec benchmarks.yml --benchmark pytorch_re ## Profile Results -When ResNet50 has finished running, see [Profiling](/docs/graphcore/Profiling) to use **Graph Analyser**. +When ResNet50 has finished running, see [Profiling](./profiling.md) to use **Graph Analyser**. diff --git a/docs/ai-testbed/graphcore/profiling.md b/docs/ai-testbed/graphcore/unused/profiling.md similarity index 97% rename from docs/ai-testbed/graphcore/profiling.md rename to docs/ai-testbed/graphcore/unused/profiling.md index 5cc42a88e..080d99535 100644 --- a/docs/ai-testbed/graphcore/profiling.md +++ b/docs/ai-testbed/graphcore/unused/profiling.md @@ -69,7 +69,7 @@ cd /path/to/graph/analyser/directory ![Graph Analyser](files/Graph_Ananlyser_main.jpg "Graph Analyser") -![Graphcore System View](files/image.png "Graphcore System View") +![Graphcore System View](files/graphcore-sys-view.png "Graphcore System View") 1. Click **Open a report...**; 2. Click the **remote** tab; diff --git a/docs/ai-testbed/habana/getting-started.md b/docs/ai-testbed/habana/getting-started.md deleted file mode 100644 index e7973c99e..000000000 --- a/docs/ai-testbed/habana/getting-started.md +++ /dev/null @@ -1,67 +0,0 @@ -# Getting Started - -## On-Boarding - -See [Get Started](https://www.alcf.anl.gov/support-center/get-started) -to request an acccount and additional information. - -## Setup - -### System View - -Connection to a Sambanova node is a two step process. First step is to ssh to a "login node". -This step requires a MFA passcode for authentication - a 8 digit passcode generated by an app on your mobile device (e.g. mobilePASS+). -The second step is to login to a sambanova node from the login node. 
-In the examples below, replace ALCFUserID with your ALCF user id. -![SambaNova System View](Log_in.png "SambaNova System View") - -### Login to Login Node - -Login to the SambaNova login node from your local machine using the below command. This uses the MobilPass+ token generated everytime you login to the system. This is the same passcode used to authenticate into other ALCF systems, such as Theta and Cooley. - -```bash -ssh ALCFUserID@sambanova.alcf.anl.gov -ALCFUserID@sambanova.alcf.anl.govs password: < MobilPass+ code > -``` - -Note: Use the ssh "-v" option in order to debug any ssh problems. - -### Login to SambaNova Node - -Once you are on the login node, the sambanova system can be accessed using the alias “sm-01” that resolves to hostname sm-01.ai.alcf.anl.gov. - -```bash -ssh sm-01 -``` - -### SDK setup - -The SambaNova system has a bash shell script to setup the required software environment. -This sets up the SambaFlow software stack, the associated environmental variables and activates -a pre-configured virtual environment. - -Use - -```bash -ALCFUserID@sm-01:~$ source /software/sambanova/envs/sn_env.sh -(venv) ALCFUserID@sm-01:~$ -``` - -The contents of the sn_env.sh script is shown below for convenience. - -```bash -alias snpath='export PATH=$PATH:/opt/sambaflow/bin' # This is the path to SambaFlow which is the software stack that is running on SambaNova systems. This stack includes the Runtime, the compilers, and the SambaFlow Python SDK which is used to create and run models. - -alias snthreads='export OMP_NUM_THREADS=1' # The OMP_NUM_THREADS environment variable sets the number of threads to use for parallel regions. The value of this environment variable must be a list of positive integer values. The values of the list set the number of threads to use for parallel regions at the corresponding nested levels.For the SambaNova system it is usually set to 1. - -alias snvenv='source /opt/sambaflow/venv/bin/activate' # This activates the pre-configured virtual environment that consists of sambaflow and other built-in libraries. -``` - -**NOTE: SambaNova operations will fail unless the SambaNova venv is set -up.** - -You may deactivate the environment if finished. - -```bash -deactivate -``` diff --git a/docs/ai-testbed/howto-contribute.md b/docs/ai-testbed/howto-contribute.md deleted file mode 100644 index 6c6366d9f..000000000 --- a/docs/ai-testbed/howto-contribute.md +++ /dev/null @@ -1,76 +0,0 @@ -# AI Testbed User Guide - -## Contributing to Documentation - -### Python environment - -To build documentation locally, you need a Python environment with `mkdocs` installed. Check that Python 3.6+ is installed: - -``` -$ python --version -Python 3.8.3 -``` - -Then create a new virtual env to isolate the `mkdocs` installation: -``` -$ python -m venv env -$ source env/bin/activate -``` - -### Git - -Using Git ssh. Make sure you add ssh public key to your profile. - -Https cloning can be used with a Personal Access Token. - -``` -$ git clone git@github.com:argonne-lcf/ai-testbed-userdocs.git -``` - -### Installing Mkdocs - -To install `mkdocs` in the current environment: - -``` -$ cd ai-testbed-userdocs -$ make install-dev -``` - -### Preview the Docs Locally - -This launches a server. Do this in a seperate terminal. - -Run `mkdocs serve` or `make serve` to auto-build and serve the docs for preview in your web browser. - -``` -$ make serve -``` - -### Working on documentation - -* All commits must have the commit comment -* Create your own branch from the main branch. 
For this writing we are using YOURBRANCH as an example. - -``` -$ cd ai-testbed-userdocs -$ git fetch --all -$ git checkout main -$ git pull origin main -$ git checkout -b YOURBRANCH -$ git push -u origin YOURBRANCH -``` -* Commit your changes to the remote repo -``` -$ cd ai-testbed-userdocs -$ git status # check the status of the files you have editted -$ git commit -a -m "Updated docs" # preferably one issue per commit -$ git status # should say working tree clean -$ git push origin YOURBRANCH # push YOURBRANCH to origin -$ git checkout main # move to the local main -$ git pull origin main # pull the remote main to your local machine -$ git checkout YOURBRANCH # move back to your local branch -$ git merge main # merge the local develop into **YOURBRANCH** and - # make sure NO merge conflicts exist -$ git push origin YOURBRANCH # push the changes from local branch up to your remote branch -``` -* Create pull request from https://github.com/argonne-lcf/ai-testbed-userdocs from YOURBRANCH to main branch. diff --git a/docs/ai-testbed/sambanova_gen1/example-programs.md b/docs/ai-testbed/sambanova_gen1/example-programs.md index e2cb50fd8..0a7434190 100644 --- a/docs/ai-testbed/sambanova_gen1/example-programs.md +++ b/docs/ai-testbed/sambanova_gen1/example-programs.md @@ -273,7 +273,7 @@ cp -r /opt/sambaflow/apps/image ~/apps/image cd ~/apps/image/unet ``` -Using the contents of [unet_compile_run_inf_rl.sh](unet_compile_run_inf_rl.sh), create a file in the current directory with the same name. +Using the contents of [unet_compile_run_inf_rl.sh](files/unet_compile_run_inf_rl.sh), create a file in the current directory with the same name. Export the path to the dataset which is required for the training. diff --git a/docs/ai-testbed/sambanova_gen1/2022-09-21T19-21-05.html b/docs/ai-testbed/sambanova_gen1/files/2022-09-21T19-21-05.html similarity index 100% rename from docs/ai-testbed/sambanova_gen1/2022-09-21T19-21-05.html rename to docs/ai-testbed/sambanova_gen1/files/2022-09-21T19-21-05.html diff --git a/docs/ai-testbed/sambanova_gen1/bw_unet_compile_run_all.sh b/docs/ai-testbed/sambanova_gen1/files/bw_unet_compile_run_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/bw_unet_compile_run_all.sh rename to docs/ai-testbed/sambanova_gen1/files/bw_unet_compile_run_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/ccle_09_19_22_11_50.log b/docs/ai-testbed/sambanova_gen1/files/ccle_09_19_22_11_50.log similarity index 100% rename from docs/ai-testbed/sambanova_gen1/ccle_09_19_22_11_50.log rename to docs/ai-testbed/sambanova_gen1/files/ccle_09_19_22_11_50.log diff --git a/docs/ai-testbed/sambanova_gen1/tmpeo5ehksn.html b/docs/ai-testbed/sambanova_gen1/files/tmpeo5ehksn.html similarity index 100% rename from docs/ai-testbed/sambanova_gen1/tmpeo5ehksn.html rename to docs/ai-testbed/sambanova_gen1/files/tmpeo5ehksn.html diff --git a/docs/ai-testbed/sambanova_gen1/unet_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_batch.sh b/docs/ai-testbed/sambanova_gen1/files/unet_batch.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_batch.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_batch.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_compile_run_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_all.sh similarity index 100% rename from 
docs/ai-testbed/sambanova_gen1/unet_compile_run_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_compile_run_inf_rl.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_inf_rl.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_compile_run_inf_rl.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_inf_rl.sh diff --git a/docs/ai-testbed/sambanova_gen1/unet_compile_run_parallel_all.sh b/docs/ai-testbed/sambanova_gen1/files/unet_compile_run_parallel_all.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen1/unet_compile_run_parallel_all.sh rename to docs/ai-testbed/sambanova_gen1/files/unet_compile_run_parallel_all.sh diff --git a/docs/ai-testbed/sambanova_gen1/uno.yaml b/docs/ai-testbed/sambanova_gen1/files/uno.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_bruce_tmp.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_bruce_tmp.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_bruce_tmp.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_bruce_tmp.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_brw_CCLE.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_brw_CCLE.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_brw_CCLE.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_brw_CCLE.yaml diff --git a/docs/ai-testbed/sambanova_gen1/uno_rick_tmp.yaml b/docs/ai-testbed/sambanova_gen1/files/uno_rick_tmp.yaml similarity index 100% rename from docs/ai-testbed/sambanova_gen1/uno_rick_tmp.yaml rename to docs/ai-testbed/sambanova_gen1/files/uno_rick_tmp.yaml diff --git a/docs/ai-testbed/sambanova_gen1/readme-rick.md b/docs/ai-testbed/sambanova_gen1/readme-rick.md deleted file mode 100644 index fb71a39f1..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme-rick.md +++ /dev/null @@ -1,253 +0,0 @@ -# SambaTune - -## Notes - -```text -#TODOBRW -ssh wilsonb@homes.cels.anl.gov -ssh sm-02 -MobilePass+ password -On sm-02 -source /opt/sambaflow/venv/bin/activate -sambatune_ui --directory /home/wilsonb/tmp/sambatune_gen --port 8580 -#There will be a username and password displayed that you will use in your browser on your laptop. -Command used on laptop for port forward -ssh -XL 8580:127.0.0.1:8580 wilsonb@sm-02.cels.anl.gov -MobilePass+ password -# You will be logged into sm-02 but, you do not need to do anything. -address used in browser on laptop localhost:8580 -#Use username and password from sambatune_ui. -Username -Password -``` - -### Rick - -8/24/2022 - -I have updated ~rweisner/tmp/sambatune with sambatune_ui 1.1 and updated the readme. - -## About SambaTune - -SambaTune is a tool for profiling, debugging, and tuning performance of applications -running on SN hardware. - -The tool automates collection of hardware performance counters, metrics aggregation, -report generation, and visualization. It also automates benchmarking of the application -to compute average throughput over a sufficient number of runs. The tool is designed to -aid the user with performance bottleneck analysis and tuning. - -SambaTune is currently used by SN engineers involved in performance tuning efforts. -SambaTune is also planned for release to external customers to aid with performance -bottleneck analysis and resolution. 
- -## Installation - -```bash -ssh wilsonb@sambanova.alcf.anl.gov -MobilePass+ pwd -ssh sm-01 -``` - -First, enter the virtual environment on **sm-01** or **sm-02**: - -```bash -source /opt/sambaflow/venv/bin/activate -``` - -## Usage - -```console -usage: sambatune [-h] [--artifact-root ARTIFACT_ROOT] [--disable-override] -[--compile-only | -m MODES [MODES ...]] -[--version] -config -positional arguments: -config -YAML file with model, compile, run configuration. -optional arguments: --h, --help ---artifact-root -show this help message and exit -ARTIFACT_ROOT -Custom location to save compile/run artifacts; -defaults to '$DUMP_ROOT/artifact_root' ---disable-override Reuse the placement from the baseline compilation ---compile-only Run compilation of PEFs for selected modes only --m MODES [MODES ...], --modes MODES [MODES ...] -Select modes to execute from ['benchmark', -'instrument', 'run'] default: ['benchmark'] ---version -version of sambatune and sambaflow. -``` - -## Command Overview - -By default, it will run with the benchmarking mode enabled. Use the --modes flag to run -modes individually or in any combination. -Benchmark-Only: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes benchmark -``` - -Instrument-Only: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes instrument -``` - -All modes: - -```bash -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes instrument -``` - -## Command Example - -### Running - -Run the following example on **sm-01** or **sm-02**: - -```bash -mkdir ~/sambatune -cd ~/sambatune -sambatune small_vae.yaml --artifact_root $(pwd)/artifact_root --modes benchmark instrument run -``` - -where **small_vae.yaml** is a user-specified configuration file: - -### Samples Config File - -The current directory should be **~/sambatune**. - -Create **small_vae.yaml** with the following content using your favorite editor. - -```yaml -small_vae.yaml: -app: /opt/sambaflow/apps/private/anl/moleculevae.py - -model-args: -b 128 --in-width 512 --in-height 512 - -compile-args: compile --plot --enable-conv-tiling --compiler-configs-file /opt/sambaflow/apps/private/anl/moleculevae/compiler_configs_conv.json --mac-v2 --mac-human-decision /opt/sambaflow/apps/private/anl/moleculevae/symmetric_human_decisions_tiled_v2.json - -run-args: --num-iterations 1000 --input-path /var/tmp/dataset/moleculevae/ras1_prot-pops.h5 --out-path ${HOME}/moleculevae_out --model-id 0 --epochs 10 - -env: - OMP_NUM_THREADS: 16 - SF_RNT_FSM_POLL_BUSY_WAIT: 1 - SF_RNT_DMA_POLL_BUSY_WAIT: 1 - CONVFUNC_DEBUG_RUN: 0 -``` - -## Install SambaTune UI on Your Development Machine - -### Copy Conda Tar File on SambaNova - -On sambanova.alcf.anl.gov: - -```bash -mkdir ~/tmp -cd ~/tmp -cp /home/rweisner/tmp/sambatune/sambatune_1.1.tar . -``` - -### Copy Conda Tar File To Your Dev Machine - -On your dev machine: - -```bash -mkdir /tmp -cd /tmp -scp ALCFUserID@sambanova:tmp/sambatune/sambatune_1.1.tar . -# Or -scp ac.rick.weisner@lambda0:tmp/sambatune/sambatune_1.1.tar . -# Or -scp wilsonb@sambanova:tmp/sambatune/sambatune_1.1.tar . 
-``` - -### Install Docker - -If necessary: - -```bash -sudo apt-get install docker -# Or -sudo snap install docker -``` - -### Docker - -If you have changed directories: - -```bash -cd /tmp -``` - -Load Docker image: - -```bash -sudo docker image load -i sambatune_1.1.tar -``` - -List Docker images: - -```bash -sudo docker image ls -``` - -Your output will look something like: - -```text -REPOSITORY TAG IMAGE ID CREATED SIZE -artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client 1.1 bf1d5834776d 3 months ago 737MB -``` - -This is the image you want -artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client 1.1 bf1d5834776d 3 months ago 737MB - -### Run the Docker Container - -Make a work directory: - -```bash -mkdir -p /path/to/work -# Or -mkdir -p /home/bwilson/sambatune/work -``` - -Run the container: - -```bash -sudo docker container run --mount type=bind,source=/path/to/work,target=/work -it -p 5050:8576 artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client:1.1 -# Or -sudo docker container run --mount type=bind,source=/home/bwilson/sambatune/work,target=/work -it -p 5050:8576 artifacts.sambanovasystems.com/sustaining-docker-lincoln-dev/sambatune/sambatune-client:1.1 -``` - -The first time you run the above command, you will see many layers being loaded. It will load immediate from then on. - -My artifact_root is in /Users/rickw/work/vae_tst/artifact_root. - -Start the UI: -It will tell you the port and password. - -sambatune_ui --directory /work/lincoln/vae_tst/artifact_root/sambatune_gen - -You will see something like: -root@477a49bd9e55:/project# sambatune_ui --directory /work/lincoln/vae_tst/artifact_root/sambatune_gen -Starting server on localhost:8576 with the following directory: /work/lincoln/vae_tst/artifact_root/sambatune_gen -with the, - username: "admin", password: "fd11af8a-edad-11ec-89c9-0242ac110002" - * Serving Flask app 'sambatune.uiwebserver' (lazy loading) - * Environment: production - WARNING: This is a development server. Do not use it in a production deployment. - Use a production WSGI server instead. - * Debug mode: off - * Running on all addresses. - WARNING: This is a development server. Do not use it in a production deployment. - * Running on http://172.17.0.2:8576/ (Press CTRL+C to quit) - -RCW: use localhost:8576 to connect - - -Now connect via browser. diff --git a/docs/ai-testbed/sambanova_gen1/readme.md b/docs/ai-testbed/sambanova_gen1/readme.md deleted file mode 100644 index aeb861e89..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme.md +++ /dev/null @@ -1,15 +0,0 @@ -# SambaNova Documentation - -* compiler-options.pdf -* getting-started.pdf -* intro-tutorial-pytorch.pdf -* release-notes.pdf -* run-examples-language.pdf -* run-examples-pytorch.pdf -* run-examples-vision.pdf -* runtime-faq.pdf -* slurm-sambanova.pdf -* snconfig-userguide.pdf -* sntilestat-manpage.pdf -* using-layernorm.pdf -* using-venvs.pdf diff --git a/docs/ai-testbed/sambanova_gen1/readme_rick_02.md b/docs/ai-testbed/sambanova_gen1/readme_rick_02.md deleted file mode 100644 index 7ee9bbb77..000000000 --- a/docs/ai-testbed/sambanova_gen1/readme_rick_02.md +++ /dev/null @@ -1,22 +0,0 @@ -# Notes - -```bash -source /software/sambanova/envs/sn_env.sh -source ~/.bashrc -cd ~/tmp -cp -rf /home/rweisner/tmp/unet . 
-cd ~/tmp/unet -export OUTDIR=~/apps/image/unet -export DATADIR=/software/sambanova/dataset/kaggle_3m -sbatch --gres=rdu:1 --tasks-per-node 4 --nodes 2 --nodelist sm-02,sm-01 --cpus-per-task=16 ./unet_batch.sh ${NP} ${NUM_WORKERS} -./unet_compile_run_all.sh compile 256 256 -ll -``` - -```console -On sm-01 or sm-02 -/home/rweisner/tmp/gpt for mpirun -/home/rweisner/tmp/unet -unet_compile_run_all.sh for slurm -./unet_compile_run_all.sh compile 256 256 -``` diff --git a/docs/ai-testbed/sambanova_gen1/sambanova.md b/docs/ai-testbed/sambanova_gen1/sambanova.md deleted file mode 100644 index 4114d6d7a..000000000 --- a/docs/ai-testbed/sambanova_gen1/sambanova.md +++ /dev/null @@ -1,110 +0,0 @@ -# SambaNova - -## PyTorch Mirrors - -See . - -There are two mirrors (in the python docs) used for downloading the -mnist dataset. - -mirrors = [ - 'http://yann.lecun.com/exdb/mnist/', - 'https://ossci-datasets.s3.amazonaws.com/mnist/'] - -[yann.lecun.com](http://yann.lecun.com) appears to be intermittently -broken (503 errors). - -## Resources - -- - -- [Argonne SambaNova Training - 11/20](https://anl.app.box.com/s/bqc101mvt3r7rpxbd2yxjsf623ea3gpe) - -- [https://docs.sambanova.ai](https://docs.sambanova.ai/) Create a - SambaNova account if you do not have one. - -- [Getting Started with - SambaFlow](https://docs.sambanova.ai/sambanova-docs/1.6/developer/getting-started.html) - Skip this one. - -- [Tutorial: Creating Models with - SambaFlow](https://docs.sambanova.ai/sambanova-docs/1.6/developer/intro-tutorial.html) - -- Administrators --- @ryade - -## Further Information - -[Human Decisions Files notes](/display/AI/Human+Decisions+Files+notes) - -## Creating a SambaNova Portal Account to access the documentation portal - -1. Go to [login.sambanova.ai](http://login.sambanova.ai/); - -2. Select the "Sign up" link at the bottom; - -3. Enter your information - - 1. Your ANL email address; - - 2. A password that you choose to access the site; - - 3. First name; - - 4. Last name; - - 5. Alternate email address; - - 6. Use 64693137 for the CLOUD ID; - - 7. Select "Register" button; - - 8. Note: The new web page may be displaying a QR code. Do not navigate away from it. Please edit this page to describe what -happenes for you. - -4. Verify your email address - - 1. Open your ANL email; - - 2. Open the email from Okta; - - 3. Select the "Activate Account" button; - - 4. Select the "Configure factor" button on the displayed web page; - - 5. Select either iPhone or Android for the device time on the new web page; - - 6. Install Okta Verify from the App Store/Google Play Store onto your mobile device.; - - 7. Select "Next" button on the web page; - -5. On your phone - - 1. Open Okta Verify app; - - 2. Select "Get Started" button; - - 3. Select "Next" button; - - 4. Select "Add Account" button; - - 5. Select "Organization" for Account Type; - - 6. Scan the QR Code shown in the browser; - -6. Sign in to the SambaNova web site - - 1. Select the "SambaNova Documentation" button. - -Authorization for sections of the SambaNova site uses the tuple (email -address, cloud id). For ANL users, th*ese **should** be an anl email -address and the cloud id specified above (64693137). (Note: the cloud -id can be changed in the SambaNova user settings.) -**If you are not at Argonne, please send us an email (ai@alcf.anl.gov) -for access. 
** -If you plan to publish, say to a conference, workshop or journal, we -have a review process wherein you share the draft with us -(pre-submission) at and we -will work with SambaNova for the requisite approvals. diff --git a/docs/ai-testbed/sambanova_gen1/performance-tools.md b/docs/ai-testbed/sambanova_gen1/unused/performance-tools.md similarity index 100% rename from docs/ai-testbed/sambanova_gen1/performance-tools.md rename to docs/ai-testbed/sambanova_gen1/unused/performance-tools.md diff --git a/docs/ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md similarity index 96% rename from docs/ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md rename to docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md index a03fdee70..d059725d2 100644 --- a/docs/ai-testbed/sambanova_gen1/running-bert-large-on-sn10-8r.md +++ b/docs/ai-testbed/sambanova_gen1/unused/running-bert-large-on-sn10-8r.md @@ -1,7 +1,7 @@ # Steps to Run BERT-Large on Sambanova DataScale SN10-8R -* BERT Code is in the [Bert](./bert/) directory here for your reference. - * [transformners_hook.py](./bert/transformers_hook.py): contains code for BERT. + + ## Creating a SambaNova Portal Account to access the documentation portal diff --git a/docs/ai-testbed/sambanova_gen1/sambatune-user-guide.md b/docs/ai-testbed/sambanova_gen1/unused/sambatune-user-guide.md similarity index 100% rename from docs/ai-testbed/sambanova_gen1/sambatune-user-guide.md rename to docs/ai-testbed/sambanova_gen1/unused/sambatune-user-guide.md diff --git a/docs/ai-testbed/sambanova_gen2/README.md b/docs/ai-testbed/sambanova_gen2/README.md deleted file mode 100644 index 70b14f58f..000000000 --- a/docs/ai-testbed/sambanova_gen2/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# SambaNova Gen2 - -List of documentation: - -[System Overview](system-overview.md) - -[Getting Started](getting-started.md) - -[Virtual Environment](virtual-environment.md) - -[Running a Model/Program](running-a-model-or-program.md) - -[Job Queuing and Submission](job-queuing-and-submission.md) - -[Example Programs](example-programs.md) - -[Example Multi-Node Programs](example-multi-node-programs.md) - -[Tunneling and Forwarding Ports](tunneling-and-forwarding-ports.md) - -[Miscellaneous](miscellaneous.md) - -[SambaNova Documentation](documentation.md) diff --git a/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md b/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md index 1cd3a6c06..d1c8f82c2 100644 --- a/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md +++ b/docs/ai-testbed/sambanova_gen2/example-multi-node-programs.md @@ -1,8 +1,6 @@ # Example Multi-Node Programs -In this section we will learn how to extend the UNet2d and Gpt1.5B applications scripts that we introduced in the [Example Programs](/docs/ai-testbed/sambanova_gen2/example-programs.md) to compile and run multiple instances of the model in a data parallel fashion across multiple tiles or across multiple nodes. - - +In this section we will learn how to extend the UNet2d and Gpt1.5B application scripts that we introduced in the [Example Programs](./example-programs.md) to compile and run multiple instances of the model in a data parallel fashion across multiple tiles or across multiple nodes. 
## UNet2d diff --git a/docs/ai-testbed/sambanova_gen2/not_published/sambatune-user-guide.md b/docs/ai-testbed/sambanova_gen2/not_published/sambatune-user-guide.md deleted file mode 100644 index 370835fc9..000000000 --- a/docs/ai-testbed/sambanova_gen2/not_published/sambatune-user-guide.md +++ /dev/null @@ -1,307 +0,0 @@ -# SambaTune - -## Notes - -```bash -cd /home/rweisner/tmp/uno_test -``` - -```text -#TODOBRW -ssh wilsonb@homes.cels.anl.gov -ssh sm-02 -MobilePass+ password -On sm-02 -source /opt/sambaflow/venv/bin/activate -export PATH=/opt/sambaflow/bin:$PATH -sambatune linear_net.yaml --artifact-root $(pwd)/artifact_root --modes benchmark instrument run -sambatune_ui --directory /home/wilsonb/tmp/sambatune_gen --port 8580 -#There will be a username and password displayed that you will use in your browser on your laptop. -Command used on laptop for port forward -ssh -XL 8580:127.0.0.1:8580 wilsonb@sm-02.cels.anl.gov -MobilePass+ password -# You will be logged into sm-02 but, you do not need to do anything. -address used in browser on laptop localhost:8580 -#Use username and password from sambatune_ui. -Username -Password - -#TODOBRW -/home/wilsonb/DL/Sambanova/apps_1.12/private/anl/2022-09-21T19-21-05.html -``` - -## About SambaTune - -SambaTune is a tool for profiling and performance tuning of applications that are running on SambaNova DataScale hardware. - -The tool automates the collection of hardware performance counters, metrics aggregation, -report generation, and visualization. It also automates benchmarking of the application -to compute average throughput over a sufficient number of runs. The tool is designed to -aid the user with performance bottleneck analysis and tuning. - -## Run SambaTune - -```bash -ssh ALCFUserID@sambanova.alcf.anl.gov -# Enter MobilePass+ pass code -ssh sn30-r1-h1 -``` - -```bash -#TODOBRW -ssh wilsonb@sambanova.alcf.anl.gov -# Enter MobilePass+ pass code -ssh sn30-r1-h1 -``` - -## TODO - -Install the SambaTune package on the host that is connected to the SambaNova hardware. - -```bash -sudo apt install -y sambanova-sambatune -sudo apt install -y sambaflow-apps-micros -``` - -## SambaTune Client Installation - -TODO: Waiting for Rick to make a .whl file available. - -## Establish Files - -A sample application, linear_net.py is included with your installation at /opt/sambaflow/apps/micros/linear_net.py. - -### Set Up - -Create the following directory and change to it if you have not already done so. - -```console -mkdir ~/app-test -cd ~/app-test -``` - -### Copy linear_net.py - -A sample application, linear_net.py, is included with your installation at /opt/sambaflow/apps/micros/linear_net.py. - -Copy the file to the current directory: - -```bash -cp /opt/sambaflow/apps/micros/linear_net.py . -``` - -### Create linear_net.yaml - -Create the file **linear_net.yaml** in the current directory using your favorite editor. -Copy the following **yaml**. - -```yaml -app: linear.py -model-args: -b 128 -mb 64 --in-features 512 --out-features 128 -compile-args: compile --plot -run-args: -n 10000 -``` - -## Command Overview - -By default, it will run with the benchmarking mode enabled. Use the --modes flag to run -modes individually or in any combination. -Benchmark-Only: - -```bash -sambatune linear_net.yaml -``` - -Run the application in instrument-only mode. - -> **Note**: The space after -- is required. - -$ sambatune --modes instrument -- /opt/sambaflow/sambatune/configs/linear_net.yaml - -Run in all modes. - -> **Note**: The space after -- is required. 
- -$ sambatune --modes benchmark instrument run -- /opt/sambaflow/sambatune/configs/linear_net.yaml - -## Usage - -> TODO Update the help - -```console -usage: sambatune [-h] [--artifact-root ARTIFACT_ROOT] [--disable-override] - [--compile-only | -m MODES [MODES ...]] [--version] - config - -positional arguments: - config YAML file with model, compile, run configuration. - -optional arguments: - -h, --help show this help message and exit - --artifact-root ARTIFACT_ROOT - Custom location to save compile/run artifacts; - defaults to '$DUMP_ROOT/artifact_root' (default: None) - --disable-override Reuse the placement from the baseline compilation - (default: False) - --compile-only Run compilation of PEFs for selected modes only - (default: False) - -m MODES [MODES ...], --modes MODES [MODES ...] - Select modes to execute from ['benchmark', - 'instrument', 'run'] (default: ['benchmark']) - --version version of sambatune and sambaflow. -``` - -## Run the sample application - -A sample application, **linear_net.py** is included with your installation at /opt/sambaflow/apps/micros/linear_net.py. - - - - - - -## SambaTune UI - -### Port Availability - -It is recommended that you check if the port you want to use is available. You may check by: - -```bash -ps -elf | grep desired_port -``` - -Example: - -```bash -ps -elf | grep 8576 -``` - -Alternatively, you may check for all ports in use by **sambatune_ui**: - -```bash -ps -elf | grep sambatune_ui -``` - -If you need to free a port that you are finished with, you may use the **kill** command. - -### Start SambaTune UI - -If you followed the above directions, your artifact_root will be at ~/sambatune/artifact_root. - -Start the UI: - -It will tell you the **username** and **password**. - -**NOTE:** It is recommended to use a port other than **8576** in case someone else is using it. Select another port close to **8576**. - -Next - -```bash -sambatune_ui --directory ~/sambatune/artifact_root/sambatune_gen/ --port 8576 -``` - -```bash -#TODOBRW -sambatune_ui --directory ~/sambatune/artifact_root/sambatune_gen/ --port 8580 -sambatune_ui --directory /home/wilsonb/tmp/uno_test/artifact_root/sambatune_gen --port 8580 -username: "admin", password: "4f7cac2c-351e-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "aaf1fc88-35c8-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "bf64e4f8-3831-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "8feca89e-384c-11ed-93a3-f7ef9c6e5d46" -username: "admin", password: "355222d6-3a88-11ed-93a3-f7ef9c6e5d46" -``` - -You will see something like: - -```console -with the, - username: "admin", password: "05c63938-2941-11ed-93a3-f7ef9c6e5d46" -[2022-08-31 15:24:36 +0000] [1344959] [Info] Starting gunicorn 20.1.0 -[2022-08-31 15:24:36 +0000] [1344959] [Info] Listening at: http://0.0.0.0:8576 (1344959) -[2022-08-31 15:24:36 +0000] [1344959] [Info] Using worker: sync -[2022-08-31 15:24:36 +0000] [1345092] [Info] Booting worker with pid: 1345092 -[2022-08-31 15:24:36 +0000] [1345093] [Info] Booting worker with pid: 1345093 -``` - -**NOTE:** Write down the username and password. - -**NOTE:** The password only works with this one instance of sambatune_ui. If you stop this instance of sambatune_ui and start another instance, it will have a new password. - -**NOTE:** You will need to **>** or use the **kill** command to stop sambatune_ui when you have finished. -Not doing so will tie up the port. -You can **ps -elf | grep the_port_you_used** to find the running processes. 
-If you are not comfortable doing this, please ask for help. - -## Use Port-Forwarding - -This describes the steps to set up port-forwarding for applications, -like SambaTune UI, which runs on the SambaNova system and binds to one or more ports. -This example uses 8576 and 18576 as port numbers. **Using port numbers other than these may -avoid collisions with other users.** - -### From your local machine - -This command sets up a port forward SambaNova login node to your local machine. - -Run - -```bash -ssh -N -f -L localhost:18576:localhost:18576 ALCFUserID@sambanova.alcf.anl.gov -... -Password: < MobilPass+ code > - -ssh ALCFUserID@sambanova.alcf.anl.gov -``` - -```bash -#TODOBRW -ssh -v -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov -ssh -N -f -L localhost:8580:localhost:8580 wilsonb@sambanova.alcf.anl.gov -... -Password: < MobilPass+ code > - -ssh wilsonb@sambanova.alcf.anl.gov -``` - -*replacing* ***ALCFUserID*** *with your ALCF User ID.* - -### From **sambanova.alcf.anl.gov** - -This command sets up a port forward from a SambaNova node to the sambanova login machine. - -Below are the commands specific to sm-01. You may replace **sm-01** with **sm-02** when using that system. - -Run - -**NOTE: The full name is sm-01.ai.alcf.anl.gov and it may also be used.** - -```bash -ssh -N -f -L localhost:18576:localhost:8576 ALCFUserID@sm-01 -``` - -```bash -#TODOBRW -ssh -N -f -L localhost:8580:localhost:8580 wilsonb@sm-01 -``` - -### Browser on Local Machine - -Then, navigate in your browser to, in this example, [http://localhost:18576](http://localhost:18576) on your local machine. - -Use the username and password from **sm-01** to log in. - -## SSH Notes - -Explanation of **ssh** command: - -```text --N : no remote commands - --f : put ssh in the background - --L ::: : - -The full command line will forward : (local scope) to : (remote scope) -``` - -Adapted from: [How can I run Tensorboard on a remote server?](https://stackoverflow.com/questions/37987839/how-can-i-run-tensorboard-on-a-remote-server) diff --git a/docs/ai-testbed/sambanova_gen2/cosmictagger-conversion.md b/docs/ai-testbed/sambanova_gen2/unused/cosmictagger-conversion.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/cosmictagger-conversion.md rename to docs/ai-testbed/sambanova_gen2/unused/cosmictagger-conversion.md diff --git a/docs/ai-testbed/sambanova_gen2/not_published/files/Gpt1.5B.sh b/docs/ai-testbed/sambanova_gen2/unused/files/Gpt1.5B.sh similarity index 100% rename from docs/ai-testbed/sambanova_gen2/not_published/files/Gpt1.5B.sh rename to docs/ai-testbed/sambanova_gen2/unused/files/Gpt1.5B.sh diff --git a/docs/ai-testbed/sambanova_gen2/performance-tools.md b/docs/ai-testbed/sambanova_gen2/unused/performance-tools.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/performance-tools.md rename to docs/ai-testbed/sambanova_gen2/unused/performance-tools.md diff --git a/docs/ai-testbed/sambanova_gen2/not_published/running-GPT2-multi-node.md b/docs/ai-testbed/sambanova_gen2/unused/running-GPT2-multi-node.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/not_published/running-GPT2-multi-node.md rename to docs/ai-testbed/sambanova_gen2/unused/running-GPT2-multi-node.md diff --git a/docs/ai-testbed/sambanova_gen2/running-GPT2.md b/docs/ai-testbed/sambanova_gen2/unused/running-GPT2.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/running-GPT2.md rename to docs/ai-testbed/sambanova_gen2/unused/running-GPT2.md diff --git 
a/docs/ai-testbed/sambanova_gen2/not_published/running-bert-large-on-sn30.md b/docs/ai-testbed/sambanova_gen2/unused/running-bert-large-on-sn30.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/not_published/running-bert-large-on-sn30.md rename to docs/ai-testbed/sambanova_gen2/unused/running-bert-large-on-sn30.md diff --git a/docs/ai-testbed/sambanova_gen2/sambatune-user-guide.md b/docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md similarity index 100% rename from docs/ai-testbed/sambanova_gen2/sambatune-user-guide.md rename to docs/ai-testbed/sambanova_gen2/unused/sambatune-user-guide.md diff --git a/docs/index.md b/docs/index.md index e9a057d61..239f8e039 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,9 +9,9 @@ Our user guides contain information for: - [Theta](theta/hardware-overview/machine-overview.md): Information on how to use our Cray XC40/KNL supercomputer. - [ThetaGPU](theta-gpu/hardware-overview/theta-gpu-machine-overview.md): Information on how to use our NVIDIA DGX A100 supercomputer. - [Cooley](cooley/cooley-overview.md): Information on how to use our visualization cluster. -- [AI Testbed](https://docs.alcf.anl.gov/ai-testbed/): Information on how to use our AI Accelerators. +- [AI Testbed](ai-testbed/getting-started.md): Information on how to use our AI Accelerators. - [Aurora/Sunspot](https://www.alcf.anl.gov/support-center/aurora-sunspot): Information on getting your code ready for our upcoming exascale supercomputer. -- [Services](services/index.md): Information on how to use various services provided across clusters. +- [Services](services/getting-started.md): Information on how to use various services provided across clusters. - [Facility Policies](policies/facility-policies.md): Information on our policies and procedures. ## How to Get Access diff --git a/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md b/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md index 4e607cd2f..e756aa315 100644 --- a/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md +++ b/docs/polaris/compiling-and-linking/compiling-and-linking-overview.md @@ -2,7 +2,7 @@ ## Compiling on Polaris Login and Compute Nodes -If your build system does not require GPUs for the build process, as is usually the case, compilation of GPU-accelerated codes is generally expected to work well on the Polaris login nodes. If your build system _does_ require GPUs, you cannot yet compile on the Polaris login nodes, as they do not currently have GPUs installed. You may in this case compile your applications on the Polaris compute nodes. Do this by submitting an [interactive single-node job](/polaris/running-jobs#Interactive-Jobs-on-Compute-Nodes), or running your build system in a batch job. +If your build system does not require GPUs for the build process, as is usually the case, compilation of GPU-accelerated codes is generally expected to work well on the Polaris login nodes. If your build system _does_ require GPUs, you cannot yet compile on the Polaris login nodes, as they do not currently have GPUs installed. You may in this case compile your applications on the Polaris compute nodes. Do this by submitting an [interactive single-node job](../running-jobs.md#interactive-jobs-on-compute-nodes), or running your build system in a batch job. 
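To make the compute-node compilation path above concrete, here is a minimal sketch of requesting an interactive single-node job on Polaris; the project name, queue, and filesystems list are illustrative placeholders, not prescriptions:

```bash
# Request one node interactively; walltime and filesystems are
# mandatory resources at the ALCF. Substitute your own values.
qsub -I -l select=1 -l walltime=01:00:00 -l filesystems=home:eagle \
     -q debug -A MyProject
```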
diff --git a/docs/polaris/compiling-and-linking/continuous-integration-polaris.md b/docs/polaris/compiling-and-linking/continuous-integration-polaris.md
deleted file mode 100644
index 155e8173c..000000000
--- a/docs/polaris/compiling-and-linking/continuous-integration-polaris.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Continuous Integration on Polaris
-
-Content is still being developed. Please check back.
-
diff --git a/docs/polaris/debugging-tools/debugging-overview.md b/docs/polaris/debugging-tools/debugging-overview.md
deleted file mode 100644
index 1e576db9a..000000000
--- a/docs/polaris/debugging-tools/debugging-overview.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Debugging Overview
-Content is still being developed. Please check back.
diff --git a/docs/polaris/performance-tools/performance-overview.md b/docs/polaris/performance-tools/performance-overview.md
deleted file mode 100644
index 6e8bb8966..000000000
--- a/docs/polaris/performance-tools/performance-overview.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Performance Tools Overview
-Content is still being developed. Please check back.
-
diff --git a/docs/running-jobs/example-job-scripts.md b/docs/running-jobs/example-job-scripts.md
index 29c8111a0..e48dd4cda 100644
--- a/docs/running-jobs/example-job-scripts.md
+++ b/docs/running-jobs/example-job-scripts.md
@@ -7,7 +7,7 @@ A simple example using a similar script on Polaris is available in the
 
 ## CPU MPI-OpenMP Examples
 
-The following `submit.sh` example submits a 1-node job to Polaris with 16 MPI ranks per node and 2 OpenMP threads per rank. See [Queues](./job-and-queue-scheduling/#queues) for details on practical limits to node counts and job times for different sizes of jobs.
+The following `submit.sh` example submits a 1-node job to Polaris with 16 MPI ranks per node and 2 OpenMP threads per rank. See [Queues](./job-and-queue-scheduling.md#queues) for details on practical limits to node counts and job times for different sizes of jobs.
 
 The [`hello_affinity`](https://github.com/argonne-lcf/GettingStarted/tree/master/Examples/Polaris/affinity_gpu) program is a compiled C++ code, which is built via `make -f Makefile.nvhpc` in the linked directory after cloning the [Getting Started](https://github.com/argonne-lcf/GettingStarted) repository.
diff --git a/docs/running-jobs/gronkulator.md b/docs/running-jobs/gronkulator.md
deleted file mode 100644
index 931000204..000000000
--- a/docs/running-jobs/gronkulator.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# The Gronkulator: Job Status Display
-Content is still being developed. Please check back.
diff --git a/docs/running-jobs/job-and-queue-scheduling.md b/docs/running-jobs/job-and-queue-scheduling.md
index 65545595d..9c9902203 100644
--- a/docs/running-jobs/job-and-queue-scheduling.md
+++ b/docs/running-jobs/job-and-queue-scheduling.md
@@ -89,7 +89,7 @@ Where:
 * `walltime=HH:MM:SS` Specifying a wall time is mandatory at the ALCF. Valid wall times depend on the queue you are using. There is a table with the queues for each machine at the end of this section and in the machine-specific documentation.
 * `filesystems=fs1:fs2:...` Specifying which filesystems your application uses is mandatory at ALCF. The reason for this is that if a filesystem goes down, we have a way of making PBS aware of that, and it won't run jobs that need that filesystem. If you don't specify filesystems you will receive the following error: `qsub: Resource: filesystems is required to be set.`
 * `place=scatter` is telling PBS you want each of your chunks on a separate vnode.
By default, PBS will pack your chunks to get maximum utilization. If you requested `ncpus=1` and `chunks=64` **without** `place=scatter` on a system with `ncpus=64`, all your chunks would end up on one node.
-* Your job script: See [Example Job Scripts](../example-job-scripts) for more information about how to build your job script. For options that wont change, you do have the option of taking things off the command line and putting them in your job script. For instance the above command line could be simplified to `qsub -l select=<#> ` if you added the following to the top (the PBS directives have to be before any executable line) of your job script:
+* Your job script: See [Example Job Scripts](./example-job-scripts.md) for more information about how to build your job script. For options that won't change, you do have the option of taking things off the command line and putting them in your job script. For instance, the above command line could be simplified to `qsub -l select=<#> ` if you added the following to the top of your job script (the PBS directives have to come before any executable line):
 
 ```bash
 #PBS -A 
diff --git a/docs/running-jobs/pbs-admin-quick-start-guide.md b/docs/running-jobs/unused/pbs-admin-quick-start-guide.md
similarity index 100%
rename from docs/running-jobs/pbs-admin-quick-start-guide.md
rename to docs/running-jobs/unused/pbs-admin-quick-start-guide.md
diff --git a/docs/services/index.md b/docs/services/getting-started.md
similarity index 100%
rename from docs/services/index.md
rename to docs/services/getting-started.md
diff --git a/docs/services/jenkins.md b/docs/services/jenkins.md
index 41f2343bb..5e98e0f16 100644
--- a/docs/services/jenkins.md
+++ b/docs/services/jenkins.md
@@ -1,7 +1,7 @@
 # Jenkins on Theta
 ## Jenkins to be decommissioned
-New projects should request access to use our GitLab-CI-based service. You can learn how to request access in our documentation found [here](/services/gitlab-ci/#quickstart).
+New projects should request access to use our GitLab-CI-based service. You can learn how to request access in our documentation found [here](./gitlab-ci.md#quickstart).
 
 Existing projects can continue to use Jenkins. We will notify projects when we have the date it will be retired. Projects will have ample notice to migrate their work to our GitLab-CI service.
diff --git a/docs/stylesheets/alcf-extra.css b/docs/stylesheets/alcf-extra.css
index 589064f0e..6c4b23aed 100644
--- a/docs/stylesheets/alcf-extra.css
+++ b/docs/stylesheets/alcf-extra.css
@@ -913,3 +913,12 @@ footer a:hover {
 .js-dropdown-hidden {
   display: none;
 }
+
+table {
+  table-layout: fixed;
+  max-width: 100%;
+}
+
+.md-typeset code {
+  overflow-wrap: break-word;
+}
diff --git a/docs/theta-gpu/data-science-workflows/building-python-packages.md b/docs/theta-gpu/data-science-workflows/building-python-packages.md
index 6fb47e406..df42771c2 100644
--- a/docs/theta-gpu/data-science-workflows/building-python-packages.md
+++ b/docs/theta-gpu/data-science-workflows/building-python-packages.md
@@ -4,7 +4,7 @@ To build Python packages for ThetaGPU, there are two options: build on top of a
 ## Build on ThetaGPU compute using Conda
 To build on ThetaGPU compute and install your own packages, log in to Theta and then submit an interactive job to log on to a ThetaGPU compute node.
-Please see [Running PyTorch with Conda](/dl-frameworks/running-pytorch-conda.md) or [Running TensorFlow with Conda](/dl-frameworks/running-tensorflow-conda/index.html) for more information.
+Please see [Running PyTorch with Conda](./dl-frameworks/running-pytorch-conda.md) or [Running TensorFlow with Conda](./dl-frameworks/running-tensorflow-conda.md) for more information.
 
 ## Building on top of a container
 At the moment, you will need two shells to do this: have one open on a login node (for example, ```thetaloginN```), and one open on a compute node (```thetagpuN```). First, start the container in interactive mode:
diff --git a/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md b/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md
index 1471a22ce..bc6d76b5d 100644
--- a/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md
+++ b/docs/theta-gpu/queueing-and-running-jobs/job-and-queue-scheduling.md
@@ -13,7 +13,7 @@ As with all Argonne Leadership Computing Facility production systems, job priori
 * job duration - shorter duration jobs will accumulate priority more quickly, so it is best to specify the job run time as accurately as possible
 
 ### Reservations and Scheduling Policy
-Some work will require use of Theta that requires deviation from regular policy. On such occasions, normal reservation policy applies. Please send the [regular form](/docs/theta/queueing-and-running-jobs/machine-reservations.md) no fewer than five (5) business days in advance.
+Some work will require use of Theta in ways that deviate from regular policy. On such occasions, normal reservation policy applies. Please send the [regular form](../../theta/queueing-and-running-jobs/machine-reservations.md) no fewer than five (5) business days in advance.
 
 ### Monday Maintenance
 When the ALCF is on a regular business schedule, preventive maintenance is typically scheduled on alternate Mondays. The showres command may be used to view pending and active maintenance reservations.
diff --git a/docs/theta/data-science-workflows/keras.md b/docs/theta/data-science-workflows/keras.md
index ba0806350..f9ba8cee4 100644
--- a/docs/theta/data-science-workflows/keras.md
+++ b/docs/theta/data-science-workflows/keras.md
@@ -7,26 +7,26 @@ On Theta, we support TensorFlow backend for Keras. To use the datascience Keras
 module on Theta, please load the following two modules:
 ```
 module load datascience/keras-2.2.4
- 
+
 module load datascience/tensorflow-1.12
 ```
-Notice that the datascience/tensorflow-* modules were compiled with AVX512 extension on Theta. Therefore, it could not run on login node, otherwise it will issue an “illegal instruction” fault. One has to submit the job to KNL nodes (see TensorFlow documentation for details).
+Notice that the `datascience/tensorflow-*` modules were compiled with the AVX512 extension on Theta. Therefore, they cannot run on the login nodes; they would issue an "illegal instruction" fault. One has to submit the job to the KNL nodes (see the TensorFlow documentation for details).
 
-Since we use TensorFlow as the backend, all the optimal environmental setups (Threading + affinity) are applicable here. Please visit the [Tensorflow documentation page](tensorflow) for the optimal setting.
+Since we use TensorFlow as the backend, all the optimal environment settings (threading + affinity) are applicable here. Please visit the [TensorFlow documentation page](tensorflow.md) for the optimal settings.
 
-We do not see any incompatibility issues in using different versions of keras and tensorflow as those specified above. Feel free to change other versions of keras or TensorFlow. Currently, we support version 2.2.2 and 2.2.4. 
+We do not see any incompatibility issues in using versions of Keras and TensorFlow other than those specified above. Feel free to use other versions of Keras or TensorFlow. Currently, we support versions 2.2.2 and 2.2.4.
 
 ## Distributed learning using Horovod
 We support distributed learning using Horovod. To use it, please load the datascience/horovod-0.15.2 module. Please change your Python script accordingly.
 
-### Initialize Horovod by adding the following lines to the beginning of your python script.
+### Initialize Horovod by adding the following lines to the beginning of your Python script.
 ```
 import horovod.keras as hvd
- 
+
 hvd.init()
 ```
-After this initialization, the total number of ranks and the rank id could be access through hvd.rank(), hvd.size() functions.
+After this initialization, the total number of ranks and the rank ID can be accessed through the hvd.size() and hvd.rank() functions.
 
 ## Scale the learning rate.
 Typically, since we use multiple workers, the global batch is usually increased n times (n is the number of workers). The learning rate should increase proportionally as follows (assuming that the learning rate initially is 0.01).
@@ -34,19 +34,19 @@ Typically, since we use multiple workers, the global batch is usually increased
 ```
 opt = keras.optimizers.Adadelta(1.0 * hvd.size())
 ```
-In some case, 0.01*hvd.size() might be too large, one might want to have some warming up steps with smaller learning rate.
+In some cases, `0.01*hvd.size()` might be too large; one might want to have some warm-up steps with a smaller learning rate.
 
 ### Wrap the optimizer with Distributed Optimizer
 ```
 opt = hvd.DistributedOptimizer(opt)
 ```
-In such case, opt will automatically average the loss and gradients among all the workers and then perform update.
+In this case, opt will automatically average the loss and gradients among all the workers and then perform the update.
 
 ### Broadcast the model from rank 0, so that all the workers will have the same starting point
 ```
 callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
 ```
-Notice that by default, TensorFlow will initialize the parameters randomly. Therefore, by default, different workers will have different parameters. So it is crucial to broadcast the model from rank 0 to other ranks.
+Notice that by default, TensorFlow will initialize the parameters randomly, so different workers will start with different parameters. It is therefore crucial to broadcast the model from rank 0 to the other ranks.
 
 ### Letting only rank 0 write checkpoints
 ```
@@ -55,12 +55,12 @@ if hvd.rank() == 0:
 callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
 
 ### Loading data according to rank ID
-Since we are using data parallel scheme. Different ranks shall process different data. One has to change the data loader part of the python script to ensure different ranks read different mini batches of data.
+Since we are using a data-parallel scheme, different ranks must process different data. One has to change the data loader part of the Python script to ensure that different ranks read different mini-batches of data.
 
 #### Example
 A simple example of doing linear regression using Keras + Horovod is available in the following directory on Theta:
-/projects/SDL_Workshop/hzheng/examples/keras/linreg
-
-linreg_keras.py is the python script, and qsub.sc is the COBALT submission script.
+/projects/SDL_Workshop/hzheng/examples/keras/linreg
+
+linreg_keras.py is the Python script, and qsub.sc is the Cobalt submission script.
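To tie the snippets above together, the following is a hedged sketch of what a Cobalt submission script in the spirit of `qsub.sc` might look like. The node count, rank placement, and thread settings are illustrative assumptions, not the contents of the actual script.

```bash
#!/bin/bash
# Hypothetical Cobalt submission script for the Horovod example above.
# Submit with something like: qsub -n 2 -t 30 -A YourProjectName qsub.sc
module load datascience/keras-2.2.4
module load datascience/tensorflow-1.12
module load datascience/horovod-0.15.2

# One MPI rank per KNL node; the script itself handles the rank-aware pieces
# (hvd.init(), learning-rate scaling, broadcast, rank-0 checkpointing).
aprun -n $COBALT_JOBSIZE -N 1 -cc depth -e OMP_NUM_THREADS=8 \
    python linreg_keras.py
```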
diff --git a/docs/theta/performance-tools/intel-advisor.md b/docs/theta/performance-tools/intel-advisor.md
index 0a9a5aca9..ba7808674 100644
--- a/docs/theta/performance-tools/intel-advisor.md
+++ b/docs/theta/performance-tools/intel-advisor.md
@@ -98,6 +98,6 @@ There are three other types of collections that can be performed with Advisor fo
 
 ## Additional Information:
 There are many command line options. See [2] for more details on all of the options; a more comprehensive user guide is also available on Intel’s website.
 
 - [1] Williams, Samuel, Andrew Waterman, and David Patterson. "Roofline: an insightful visual performance model for multicore architectures." Communications of the ACM 52.4 (2009): 65-76.
-- [2] Intel. “Get Started with Intel Advisor.” Intel® Software, Intel, 18 Oct. 2018, [software.intel.com/en-us/get-started-with-advisor-for-more-information](software.intel.com/en-us/get-started-with-advisor-for-more-information)
+- [2] Intel. “Get Started with Intel Advisor.” Intel® Software, Intel, 18 Oct. 2018
diff --git a/docs/theta/programming-models/openmp-theta.md b/docs/theta/programming-models/openmp-theta.md
index bc16757f3..25dcd29a3 100644
--- a/docs/theta/programming-models/openmp-theta.md
+++ b/docs/theta/programming-models/openmp-theta.md
@@ -21,7 +21,7 @@ To enable OpenMP, use the following flags in your compile line, depending on the
 | LLVM | PrgEnv-llvm | `-fopenmp` |
 
 ## Running Jobs with OpenMP on Theta
-To run jobs on Theta with OpenMP threads, the OpenMP environment variable OMP_NUM_THREADS will need to be set to the desired number of threads per MPI rank, and certain flags in the aprun command will need to be set. Some examples are given below, and more information about running is here: [Affinity on Theta](/docs/theta/queueing-and-running-jobs/affinity-theta.md).
+To run jobs on Theta with OpenMP threads, the OpenMP environment variable `OMP_NUM_THREADS` must be set to the desired number of threads per MPI rank, and certain flags must be passed to the aprun command. Some examples are given below, and more information about running is here: [Affinity on Theta](../queueing-and-running-jobs/affinity-theta.md).
 
 ### Source code for xthi.c:
 ```
@@ -94,11 +94,11 @@ int main(int argc, char *argv[])
 $ cc -qopenmp xthi.c -o xthi # PrgEnv-intel
 ```
 
-2. Run with aprun (either in a batch script that is submitted to the job scheduler a or on the command line as part of an interactive session. See [job scheduling](/docs/theta/queueing-and-running-jobs/job-and-queue-scheduling.md) for more details about how to run.
+2. Run with `aprun`, either in a batch script that is submitted to the job scheduler or on the command line as part of an interactive session. See [job scheduling](../queueing-and-running-jobs/job-and-queue-scheduling.md) for more details about how to run.
 
 Mapping of OpenMP threads to hardware threads on a KNL node can be achieved with the `--cc` option in aprun.
 
-One common option described in more detail on [Affinity on Theta](/docs/theta/queueing-and-running-jobs/affinity-theta.md) is to use --cc depth with the -d and -j flags:
+One common option, described in more detail on [Affinity on Theta](../queueing-and-running-jobs/affinity-theta.md), is to use `--cc depth` with the `-d` and `-j` flags:
 
 ```
 $ aprun -n 1 -N 1 -d 8 -j 1 -cc depth -e OMP_NUM_THREADS=8 ./a.out
@@ -113,7 +113,7 @@ Hello from rank 0, thread 6, on nid03554. 
(core affinity = 6) Application 19165961 resources: utime ~1s, stime ~1s, Rss ~6284, inblocks ~0, outblocks ~8 ``` -Another option is to use --cc none with OpenMP affinity controls: +Another option is to use `--cc none` with OpenMP affinity controls: ``` $ aprun -n 1 -N 1 -cc none -e OMP_NUM_THREADS=8 -e OMP_PROC_BIND=spread -e OMP_PLACES=cores ./a.out diff --git a/docs/theta/queueing-and-running-jobs/affinity-theta.md b/docs/theta/queueing-and-running-jobs/affinity-theta.md index a7566c782..d6f58ae7e 100644 --- a/docs/theta/queueing-and-running-jobs/affinity-theta.md +++ b/docs/theta/queueing-and-running-jobs/affinity-theta.md @@ -12,7 +12,7 @@ The numbers inside the quadrants identify the specific hardware threads in the c
Visual representation of two tiles in a KNL
-Using the -j, -d, and --cc arguments to aprun and environment variables, MPI ranks and threads can be assigned to run on specific hardware threads. For more information about the flags to aprun, see [Running Jobs on Theta](/docs/theta/queueing-and-running-jobs/job-and-queue-scheduling.md). Four examples of using aprun are given below followed by descriptions of two methods for displaying the mapping produced.
+Using the `-j`, `-d`, and `--cc` arguments to aprun, together with environment variables, MPI ranks and threads can be assigned to run on specific hardware threads. For more information about the flags to aprun, see [Running Jobs on Theta](../queueing-and-running-jobs/job-and-queue-scheduling.md). Four examples of using aprun are given below, followed by descriptions of two methods for displaying the mapping produced.
 
 **Note:** Logical core and hardware thread are used interchangeably below.
diff --git a/mkdocs.yml b/mkdocs.yml
index fc46c1590..90a6f2ed0 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -25,7 +25,9 @@ nav:
       - Data Storage: data-management/filesystem-and-storage/data-storage.md
       - HPSS: data-management/filesystem-and-storage/hpss.md
       - Disk Quota: data-management/filesystem-and-storage/disk-quota.md
-  - Services: # services/index.md # Cant directly link to this in the nav sidebar, since it is a dropdown. Only linked to in base docs/index.md
+  - Services: # services/index.md # Can't directly link to this in the nav sidebar, since
+              # it is a dropdown. Only linked to in base docs/index.md
+      - Getting Started: services/getting-started.md
       - JupyterHub: services/jupyter-hub.md
       - Continuous Integration:
          - General: services/continuous-integration.md
@@ -142,6 +144,7 @@ nav:
       - OpenMP: theta/programming-models/openmp-theta.md
       - Raja: theta/programming-models/raja.md
      - Debugging Tools:
+         - Overview: theta/debugging-tools/debugging-overview.md
         - Arm DDT: theta/debugging-tools/arm-ddt.md
         - ATP and STAT: theta/debugging-tools/atp-and-stat.md
         - GDB: theta/debugging-tools/gdb.md
@@ -188,7 +191,7 @@ nav:
       - Darshan: theta-gpu/performance-tools/darshan.md
       - NVIDIA Nsight: theta-gpu/performance-tools/nvidia-nsight.md
   - AI Testbed:
-      - Getting Started: ai-testbed/index.md
+      - Getting Started: ai-testbed/getting-started.md
       - Cerebras:
          - System Overview: ai-testbed/cerebras/system-overview.md
          - Getting Started: ai-testbed/cerebras/getting-started.md
@@ -280,6 +283,13 @@ nav:
       - Data and Software Policies:
          - Data Policy: policies/data-and-software-policies/data-policy.md
          - Software Policy: policies/data-and-software-policies/software-policy.md
+not_in_nav: |
+  not_in_nav/
+  unused/
+  todo.md
+  TODO.md
+  notes.md
+  theta/theta-decommissioning.md
 
 theme:
   name: 'material'
@@ -334,6 +344,10 @@ markdown_extensions:
   - pymdownx.tasklist:
       custom_checkbox: true
 
+validation:
+  omitted_files: warn
+  absolute_links: warn
+  unrecognized_links: warn
 
 repo_name: 'argonne-lcf/alcf-userguide'
 repo_url: 'https://github.com/argonne-lcf/alcf-userguide'
diff --git a/requirements.txt b/requirements.txt
index f3c919456..2d21f0fc1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 mkdocs-material
-mkdocs
+mkdocs>=1.5.0
 mkdocs-video
 mkdocs-include-markdown-plugin
 mkdocs-codeinclude-plugin>=0.2.1
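One practical note on the configuration added above: with `mkdocs>=1.5.0`, the `validation` block reports omitted files, absolute links, and unrecognized links as warnings, and the `--strict` flag wired into the Makefile and workflows turns those warnings into build failures. You can reproduce the CI check locally before pushing:

```bash
# Build exactly as CI does; any link-validation warning fails the build.
pip install -r requirements.txt
mkdocs build --strict
```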