Merge branch 'master' into jeffra/inject_v2

microsoft · Jan 6, 2021 · a5a34c6 · a5a34c6
2 parents 1b42798 + 5ab1279
commit a5a34c6
Show file tree

Hide file tree

Showing 50 changed files with 1,335 additions and 253 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -4,14 +4,12 @@ name: Build
 
 # Controls when the action will run.
 on:
-  # Triggers the workflow on push or pull request events but only for the master branch
   push:
-    branches: [ master ]
+    paths-ignore:
+    - 'docs/**'
   pull_request:
-    branches: [ master ]
-
-  # Allows you to run this workflow manually from the Actions tab
-  workflow_dispatch:
+    paths-ignore:
+    - 'docs/**'
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:

diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml
@@ -0,0 +1,47 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Tests-w-precompiled-ops
+
+# Controls when the action will run.
+on:
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "build"
+  build:
+    # The type of runner that the job will run on
+    runs-on: self-hosted
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v2
+
+      # Prints GPU, Python, CUDA toolchain and PyTorch details using the runner's shell
+      - name: environment
+        run: |
+          nvidia-smi
+          which python
+          python --version
+          which nvcc
+          nvcc --version
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      # Installs DeepSpeed with DS_BUILD_OPS=1 and prints ds_report using the runner's shell
+      - name: Install deepspeed
+        run: |
+          DS_BUILD_OPS=1 pip install .[dev]
+          ds_report
+
+      - name: Formatting checks
+        run: |
+           pre-commit run --all-files
+
+      # Runs the unit tests against a fresh ./torch-extensions directory using the runner's shell
+      - name: Unit tests
+        run: |
+          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
diff --git a/DeepSpeedExamples b/DeepSpeedExamples
diff --git a/bin/ds_elastic b/bin/ds_elastic
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+
+import deepspeed
+from deepspeed.elasticity import compute_elastic_config
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json")
+    parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size")
+    args = parser.parse_args()
+    ds_config = json.load(open(args.config, 'r'))
+
+    ds_version = deepspeed.__version__
+
+    elastic_config = ds_config['elasticity']
+    print('------------------------------------------')
+    print("Elasticity config:")
+    print('------------------------------------------')
+    print(json.dumps(elastic_config, indent=4, sort_keys=True))
+
+    if args.world_size > 0:
+        final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size)
+        print('------------------------------------------')
+        print(f"Calculated results for world size {args.world_size}:")
+        print('------------------------------------------')
+        print(f'final_batch_size .... {final_batch_size}')
+        print(f'valid_gpus .......... {valid_gpus}')
+        print(f'micro_batch_size .... {micro_batch_size}')
+    else:
+        final_batch_size, valid_gpus = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)
+        print('------------------------------------------')
+        print("Calculated results:")
+        print('------------------------------------------')
+        print(f'final_batch_size .... {final_batch_size}')
+        print(f'valid_gpus .......... {valid_gpus}')
diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp
@@ -14,6 +14,8 @@
 
 static std::unordered_map<int, std::shared_ptr<void>> s_transformer_layers;
 
+const int init_seq_length = 128;
+
 // C++ interface
 
 template <typename T>
@@ -591,7 +593,6 @@ int create_transformer_layer(int layer_id,
                              int hidden_dim,
                              int num_heads,
                              int intermediate_size,
-                             int seq_length,
                              float attn_dropout_ratio,
                              float hidden_dropout_ratio,
                              int seed,
@@ -604,14 +605,14 @@ int create_transformer_layer(int layer_id,
 {
     Context::Instance().SetSeed(seed);
     Context::Instance().TestGemmFP16(
-        test_gemm, batch_size, seq_length, num_heads, hidden_dim / num_heads);
+        test_gemm, batch_size, init_seq_length, num_heads, hidden_dim / num_heads);
 
     auto layer = std::make_shared<BertTransformerLayer<T>>(layer_id,
                                                            batch_size,
                                                            hidden_dim,
                                                            num_heads,
                                                            intermediate_size,
-                                                           seq_length,
+                                                           init_seq_length,
                                                            attn_dropout_ratio,
                                                            hidden_dropout_ratio,
                                                            pre_or_postLayerNorm,
@@ -873,6 +874,12 @@ std::vector<torch::Tensor> ds_transformer_backward(int layer_id,
     std::shared_ptr<BertTransformerLayer<T>> layer =
         std::static_pointer_cast<BertTransformerLayer<T>>(s_transformer_layers[layer_id]);
 
+    int seq_len = layer->GetSeqLength();
+    if (g_output.size(1) != seq_len) {
+        seq_len = g_output.size(1);
+        layer->SetSeqLength(seq_len, bsz);
+    }
+
     auto grad_input = torch::empty_like(input);
     auto grad_attn_qkvw = torch::empty_like(attn_qkvw);
     auto grad_attn_qkvb = torch::empty_like(attn_qkvb);

diff --git a/csrc/transformer/softmax_kernels.cu b/csrc/transformer/softmax_kernels.cu
@@ -80,7 +80,8 @@ __global__ void attn_softmax(float* vals,
 #endif
 
         int iters = warp_num;
-        if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
+        if (seq_length < iteration_stride)
+            iters = warp_num / (iteration_stride / max_threads_in_sequence);
 
         for (int i = 1; i < iters; i *= 2) {
             auto temp = g.shfl_xor(max_val, i);
@@ -113,7 +114,8 @@ __global__ void attn_softmax(float* vals,
 #endif
 
         int iters = warp_num;
-        if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
+        if (seq_length < iteration_stride)
+            iters = warp_num / (iteration_stride / max_threads_in_sequence);
 
         for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
 
@@ -216,7 +218,8 @@ __global__ void attn_softmax(__half* vals,
 #endif
 
         int iters = warp_num;
-        if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
+        if (seq_length < iteration_stride)
+            iters = warp_num / (iteration_stride / max_threads_in_sequence);
 
         for (int i = 1; i < iters; i *= 2) {
             auto temp = g.shfl_xor(max_val, i);
@@ -252,7 +255,8 @@ __global__ void attn_softmax(__half* vals,
 #endif
 
         int iters = warp_num;
-        if (seq_length < iteration_stride) iters = warp_num / (iteration_stride / seq_length);
+        if (seq_length < iteration_stride)
+            iters = warp_num / (iteration_stride / max_threads_in_sequence);
 
         for (int i = 1; i < iters; i *= 2) { sum += g.shfl_xor(sum, i); }
 
@@ -339,7 +343,9 @@ void launch_attn_softmax<float>(float* vals,
         dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
                                                 subblock_max_workload * threads)
                                              : threads);
-
+        iterations =
+            (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
+                                                     : MAX_THREAD_ITERATIONS);
         if (sequence_length <= 512)
             attn_softmax<32, (threads / 128), 128><<<grid_dim, block_dim, 0, stream>>>(
                 vals, attn_mask, heads, seq_length4, iterations);
@@ -408,7 +414,9 @@ void launch_attn_softmax<__half>(__half* vals,
         dim3 block_dim(seq_length4 > threads ? ((sequence_length + subblock_max_workload - 1) /
                                                 subblock_max_workload * threads)
                                              : threads);
-
+        iterations =
+            (sequence_length < subblock_max_workload ? (seq_length4 + threads - 1) / threads
+                                                     : MAX_THREAD_ITERATIONS);
         if (sequence_length <= 512)
             attn_softmax<32, (threads / 128), 128><<<grid_dim, block_dim, 0, stream>>>(
                 vals, attn_mask, heads, seq_length4, iterations);

diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py
@@ -14,6 +14,7 @@
 from .runtime.activation_checkpointing import checkpointing
 from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
 from .utils import log_dist
+from .utils.distributed import init_distributed
 
 from .pipe import PipelineModule
 

diff --git a/deepspeed/constants.py b/deepspeed/constants.py
@@ -0,0 +1,8 @@
+'''
+Copyright 2020 The Microsoft DeepSpeed Team
+'''
+
+#############################################
+# Torch distributed constants
+#############################################
+TORCH_DISTRIBUTED_DEFAULT_PORT = 29500
diff --git a/deepspeed/elasticity/__init__.py b/deepspeed/elasticity/__init__.py
@@ -0,0 +1 @@
+from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config
diff --git a/deepspeed/elasticity/config.py b/deepspeed/elasticity/config.py
@@ -0,0 +1,80 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+
+import json
+from .constants import *
+
+
+class ElasticityError(Exception):
+    """
+    Base exception for all elasticity related errors
+    """
+    pass
+
+
+class ElasticityConfigError(ElasticityError):
+    """
+    Elasticity configuration error
+    """
+    pass
+
+
+class ElasticityIncompatibleWorldSize(ElasticityError):
+    """
+    Attempting to run a world size that is incompatible with a given elastic config
+    """
+    pass
+
+
+class ElasticityConfig:
+    """
+    Elastic config object, constructed from a param dictionary that only contains elastic
+    config parameters, example below:
+
+    If elasticity is enabled, user must specify (at least) max_train_batch_size
+    and micro_batch_sizes.
+
+    {
+        "enabled": true,
+        "max_train_batch_size": 2000,
+        "micro_batch_sizes": [2,4,6],
+        "min_gpus": 1,
+        "max_gpus" : 10000,
+        "min_time": 20,
+        "ignore_non_elastic_batch_info": false,
+        "version": 0.1
+    }
+    """
+    def __init__(self, param_dict):
+        self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT)
+        if self.enabled:
+            if MAX_ACCEPTABLE_BATCH_SIZE in param_dict:
+                self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE]
+            else:
+                raise ElasticityConfigError(
+                    f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}")
+            if MICRO_BATCHES in param_dict:
+                self.micro_batches = param_dict[MICRO_BATCHES]
+            else:
+                raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}")
+        else:
+            self.max_acceptable_batch_size = param_dict.get(
+                MAX_ACCEPTABLE_BATCH_SIZE,
+                MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT)
+            self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT)
+        self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT)
+        self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT)
+        self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT)
+        self.version = param_dict.get(VERSION, VERSION_DEFAULT)
+        self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH,
+                                                       PREFER_LARGER_BATCH_DEFAULT)
+        self.ignore_non_elastic_batch_info = param_dict.get(
+            IGNORE_NON_ELASTIC_BATCH_INFO,
+            IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT)
+
+    def repr(self):
+        return self.__dict__
+
+    def __repr__(self):
+        return json.dumps(self.__dict__, sort_keys=True, indent=4)
diff --git a/deepspeed/elasticity/constants.py b/deepspeed/elasticity/constants.py
@@ -0,0 +1,74 @@
+"""
+Copyright 2020 The Microsoft DeepSpeed Team
+"""
+
+#########################################
+# Elasticity
+#########################################
+''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible
+with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that
+can support a large number of GPUs based on the user-specified parameters.
+'''
+FORMAT = '''
+Elasticity should be enabled as:
+"elasticity": {
+  "enabled": true,
+  "max_train_batch_size": 2000,
+  "micro_batch_sizes": [2,4,6],
+  "min_gpus": 1,
+  "max_gpus" : 10000
+  "min_time": 20,
+  "prefer_larger_batch": true,
+  "ignore_non_elastic_batch_info": false,
+  "version": 0.1
+}
+'''
+
+ELASTICITY = 'elasticity'
+
+# Current elasticity version
+LATEST_ELASTICITY_VERSION = 0.1
+
+ENABLED = 'enabled'
+ENABLED_DEFAULT = False
+
+# Max acceptable train_batch_size
+MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
+MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000
+
+# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu
+MICRO_BATCHES = 'micro_batch_sizes'
+MICRO_BATCHES_DEFAULT = [2, 4, 6]
+
+# Min/max of GPUs to search over
+MIN_GPUS = 'min_gpus'
+MIN_GPUS_DEFAULT = 1
+MAX_GPUS = 'max_gpus'
+MAX_GPUS_DEFAULT = 10000
+
+# Minimum running time (minutes) before the scheduler will scale us
+MIN_TIME = "min_time"
+MIN_TIME_DEFAULT = "20"
+
+# When finding a suitable batch size, attempt to find one that is closest
+# to the max train batch size given.
+PREFER_LARGER_BATCH = 'prefer_larger_batch'
+PREFER_LARGER_BATCH_DEFAULT = True
+
+# In order to reduce confusion, if elastic mode is enabled we
+# require (via assert) that no batch info is set outside of the
+# elastic config. You can turn off this assert via this config
+# but keep in mind that all batch info defined outside the
+# elastic mode *will be ignored*.
+IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
+IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False
+
+# Version of elastic logic to use
+VERSION = "version"
+VERSION_DEFAULT = LATEST_ELASTICITY_VERSION
+
+# Minimum deepspeed version to use elasticity
+MINIMUM_DEEPSPEED_VERSION = "0.3.8"
+
+# Environment variable storing elastic config from resource scheduler
+DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"
+2 −0		BingBertSquad/nvidia_run_squad_deepspeed.py
+1 −1		BingBertSquad/run_squad_deepspeed.sh
+1 −1		BingBertSquad/turing/nvidia_modelingpreln.py
+15 −11		Megatron-LM/pretrain_gpt2.py
+9 −9		bing_bert/bert_large.json
+1 −2		bing_bert/deepspeed_bsz4k_onebit_config_seq128.json
+1 −1		bing_bert/deepspeed_train.py
+2 −3		bing_bert/ds_train_bert_onebit_bsz4k_seq128.sh
+1 −2		bing_bert/mpi_train_bert_onebitadam_bsz4k_seq128.sh
+0 −1		bing_bert/nvidia/modelingpreln.py
+1 −1		pipeline_parallelism/train.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config