diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 8bba51dab6fd..2cf7c02923b9 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -5,6 +5,7 @@ on: paths-ignore: - 'docs/**' - 'blogs/**' + workflow_dispatch: merge_group: branches: [ master ] @@ -17,7 +18,7 @@ jobs: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - id: setup-venv uses: ./.github/workflows/setup-venv diff --git a/README.md b/README.md index 6aef71b8e66e..e4560979b124 100755 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ ## Latest News DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) * [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 
system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) @@ -234,6 +235,8 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information 24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847) 25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320). 26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) +27. Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf) + # Videos diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index a87ff3c1d223..b6ef453464a2 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -226,7 +226,11 @@ def LongTensor(self): ... @abc.abstractmethod - def pin_memory(self, tensor): + def pin_memory(self, tensor, align_bytes=1): + ... + + @abc.abstractmethod + def is_pinned(self, tensor): ... 
@abc.abstractmethod diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index 4de4ad93c2bb..6570a2bdac67 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -225,9 +225,12 @@ def IntTensor(self): def LongTensor(self): return torch.LongTensor - def pin_memory(self, tensor): + def pin_memory(self, tensor, align_bytes=1): return tensor + def is_pinned(self, tensor): + return tensor.is_pinned() + def op_builder_dir(self): try: # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 045cce510a90..b0aa34970a98 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -210,9 +210,12 @@ def IntTensor(self): def LongTensor(self): return torch.cuda.LongTensor - def pin_memory(self, tensor): + def pin_memory(self, tensor, align_bytes=1): return tensor.pin_memory() + def is_pinned(self, tensor): + return tensor.is_pinned() + def on_accelerator(self, tensor): device_str = str(tensor.device) if device_str.startswith('cuda:'): diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 63a92f250898..b5488dcdf8ca 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -192,9 +192,12 @@ def IntTensor(self): def LongTensor(self): return - def pin_memory(self, tensor): + def pin_memory(self, tensor, align_bytes=1): return tensor.pin_memory() + def is_pinned(self, tensor): + return tensor.is_pinned() + def on_accelerator(self, tensor): device_str = str(tensor.device) if device_str.startswith("mps"): diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 206bc1dfaa1b..912a6c89e228 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -191,9 +191,12 @@ def IntTensor(self): def LongTensor(self): return torch.npu.LongTensor - def pin_memory(self, tensor): + def pin_memory(self, 
tensor, align_bytes=1): return tensor.pin_memory() + def is_pinned(self, tensor): + return tensor.is_pinned() + def on_accelerator(self, tensor): device_str = str(tensor.device) if device_str.startswith('npu:'): diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md new file mode 100755 index 000000000000..06dcd851766c --- /dev/null +++ b/blogs/deepspeed-visualchat/10-03-2023/README.md @@ -0,0 +1,188 @@ +
+ +# DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs + +
+ +
+ +DeepSpeed-VisualChat! + +
+ +To cite DeepSpeed-VisualChat, please cite our [arxiv report](https://arxiv.org/abs/2309.14327): + +``` +@article{yao2023deepspeed-visualchat, + title={{DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention}}, + author={Zhewei Yao and Xiaoxia Wu and Conglong Li and Minjia Zhang and Heyang Qin and Olatunji Ruwase and Ammar Ahmad Awan and Samyam Rajbhandari and Yuxiong He}, + journal={arXiv preprint arXiv:2309.14327}, + year={2023} +} +``` +# 1. Overview +Large Language models (LLMs), such as GPT and LLaMa, have showcased exceptional prowess in a myriad of text generation and comprehension tasks, especially when subjected to zero-/few-shot learning, particularly after instructed fine-tuning. However, to equip AI agents for diverse tasks, one critical feature that needs to be incorporated is multi-modal capability; for instance, the AI agent should be able to read images, hear voices, watch videos, etc. This capability is largely absent in solely text-based LLMs. + +Recently, one of the research/practice mainstreams has begun exploring the incorporation of visual capability into LLMs, especially enabling LLMs to understand images by inserting raw pictures (referred to as large visual language models, or LVLMs in short). + +The main caveats of the majority of existing works are: +* The focus is predominantly on tasks related to a single image, such as visual question answering and captioning, or on handling multiple images that require concurrent input. Neither approach adeptly manages interleaved image-and-text input. +* The scalability of the system is limited to models with ~10B parameters, which is about an order of magnitude smaller than largest open-sourced models. + +However, for a genuine AI chat agent, the content of inputs could be multiple images interleaved with text, a situation rarely addressed by current works. Also, the generation capability of LLMs grows quickly as the model size increases. 
Therefore, focusing system capability on ~10B models limits further exploration of the potential of LVLMs. + +To resolve these issues, we are introducing DeepSpeed-VisualChat (see [arxiv report](https://arxiv.org/abs/2309.14327) for more details) with the following new features: + +* ***Fully Open-Sourced Multi-round Multi-image Framework with Unprecedented Scalability***: DeepSpeed-VisualChat, one of the pioneering fully open-sourced frameworks, enables multi-round and multi-image dialogues, accommodating interleaved text-and-image inputs. We leverage DeepSpeed to enhance our training with a 2B visual encoder and a 70B LLaMA-2 decoder model, illustrating the remarkable scalability of our framework. +* ***Multi-Modal Causal Attention (MMCA)*** +We devise a novel MMCA for multi-modal models that computes attention weights independently across various modalities. MMCA achieves objectives analogous to conventional cross-attention mechanisms but offers enhanced causal attention interpretations for generative tasks, eliminating the need for additional modules or parameters. It also presents superior training data efficiency compared to standard causal attention. +* ***Data Blending for Interleaved Inputs*** To facilitate conversations with interleaved modalities, DeepSpeed-VisualChat employs assorted data blending techniques on existing datasets, overcoming the shortage of interleaved text-and-image inputs in most available open-source datasets. + + + +# 2. Model architecture overview +
+ model arch + + *Figure 1: Model architecture illustration.* + +
+ +The model architecture of DeepSpeed-VisualChat, as depicted in *Figure 1*, is composed of three components: a visual encoder, such as CLIP; a language decoder, such as LLaMa-7B; and a feature alignment linear projection layer. Most parts of the model are frozen, with only the embedding of the language model and the linear projection layer being trainable. Consequently, the total number of trainable parameters ranges from approximately O(10M) (LLaMa-2-13B) to O(100M) (LLaMa-2-70B). + +# 3. DeepSpeed multi-modal causal attention + +There are two common attention mechanisms used to connect the visual and textual components in a multi-modal model: causal attention, as used in MiniGPT and QWen-VL, and cross attention, as used in Otter and Flamingo. + +
+ Different attention mechanisms +
+ +*Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$), with $`Q`$ and $`K`$ as query and key, $`M_1 = [M == 1]`$, $`M_2 = [M == 2]`$, and $`M \in \mathbb{R}^{10 \times 10}`$ in this case.* + + +Causal Attention (CA): The CA-based method simply projects visual features (i.e., the features from the output of the final visual encoder layer) into textual features and combines them with the normal textual features after the textual embedding layer to feed into LLMs. The benefit of CA is that it's a natural extension of the original attention mechanism in LLMs, and as such, it doesn't introduce any extra modules or parameters. However, this approach raises some intuitive problems: + +* For a visual token, it attends to previous visual and textual tokens, even though visual tokens are already fully encoded in a bidirectional manner and do not need further attention from other visual tokens or the beginning of textual tokens. +* For a textual token, the model needs to learn how to distribute its attention weights between its previous textual and image tokens. Due to these issues, we found that the data efficiency of CA in LVLMs is often problematic. To address this, LLaVA and QWen-VL require visual-language pretraining to fully align visual features with textual features. 
+ +Cross Attention (CrA): The alternative, cross attention (CrA), along with CA, exhibits better data efficiency but also comes with a few drawbacks: + +* It introduces new parameters to the model. For example, Otter has more than 1.5 billion trained parameters compared to the millions of trained parameters in LLaVA due to the new parameters introduced by cross attention. This significantly increases the training cost and memory requirements. +* It requires careful design if an image is introduced in the middle of a conversation during training, as previous text tokens should not be able to attend to the image. + +Multi-Modal Causal Attention Mechanism (MMCA): To overcome these issues, we propose a new multi-modal causal attention mechanism (MMCA), which has both benefits, i.e., similar parameter efficiency as CA and similar data efficiency as CrA. The overall idea is as follows: + +* For visual tokens, they only attend to themselves, as visual tokens are encoded by the visual encoder. +* For textual tokens, they attend to all their previous tokens. However, they have two separate attention weight matrices for their previous textual tokens and image tokens. + +The intuition behind the second point of MMCA is that the attention weight for one modality may affect the other modality. For instance, a textual token may pay more attention to textual information than visual information. Therefore, if the attention weight matrix is normalized across both modalities, the attention score for visual tokens might be very small. Refer to *Figure 2* for a visualization of the three attention mechanisms. + + +Demo Results. We begin by showcasing various examples that highlight the capabilities of DeepSpeed-VisualChat in single-image visual language conversations, employing different attention mechanisms. In these experiments, we employ the LLaMA2-7B language model in conjunction with the QWen-VL visual-encoder as our visual encoder. 
These two models are connected via a straightforward linear projection layer. Our model underwent training on two LLaVa datasets. As demonstrated in *Figure 3* and *Figure 4*, DeepSpeed-VisualChat, when coupled with MMCA, effectively discerns visual details in images and furnishes coherent responses to user queries. +Furthermore, DeepSpeed-VisualChat exhibits a more comprehensive and precise grasp of image details compared to alternative attention mechanisms, such as the use of combined masks from both causal attention and cross attention. It is also evident that, in contrast to the combination of CrA and CA, as well as MMCA, CA alone may exhibit slightly more errors (*Figure 3*) and capture a lower degree of reasoning capability (*Figure 4*). + +
+ Small kitten + + *Figure 3: Example visual and language inputs that demonstrate the output comparison between (1) the standard causal attention (CA), (2) the standard causal attention combined with cross-attention (CA + CrA), and (3) the special multi-modal causal attention (MMCA) in DeepSpeed-VisualChat.* + +
+ +
+ Beautiful lake + + *Figure 4: DeepSpeed-VisualChat accurately identifies the scene as a beautiful lake and offers a set of plausible suggestions. In contrast, the baseline misinterprets the image as containing “dock with a boat ramp”.* + +
+ +# 4. Data blending +We used 9 datasets from 3 sources as described in our [arxiv report](https://arxiv.org/abs/2309.14327). A critical missing element for enabling multi-round and multi-image conversations is the absence of adequate data. The sole source of multi-round multi-image data we located is the SparklesDialogue dataset, which contains a mere 6520 samples. To address this limitation, we employed two methods to synthesize multi-round multi-image data from existing single-image or single-round data: simple data concatenation and LLaVA-Otter data blending. + +## 4.1 Simple data concatenation +For the "llava" and "llava_dial" datasets utilized by the LLaVA model, each sample comprises single/multi-round conversations for a single image. To simulate scenarios where a user sequentially asks questions about multiple images, we conducted straightforward data post-processing for these two datasets. Specifically, we randomly concatenated different numbers of samples into a single sample. In the case of "llava," we concatenated 1 to 3 samples, while for "llava_dial," we concatenated 1 to 2 samples. + +## 4.2 LLaVA-Otter data blending +We noticed that the llava and llava_dial datasets used by LLaVA model and the otter_mimicit_cgd dataset used by the Otter model all use the COCO train2017 images. For the llava and llava_dial datasets, each sample includes a single/multi-round conversations for a single image. For the otter_mimicit_cgd dataset, each sample includes a single-round conversation for a pair of images. This enables us to build a synthesized multi-round multi-image data llava_otter_blend as a more natural blending: for each sample in the otter_mimicit_cgd dataset, we look for llava and llava_dial samples that use the same image, and then build a new sample in a "llava/llava_dial conversations then otter_mimicit_cgd conversation" fashion. + +
+ Friends + + *Figure 5: A data sample after LLaVA-Otter data blending. Gray dialog boxes are from the LLaVA datasets, and orange ones are from the Otter dataset.* +
+ +# 5. Demonstration +We trained our DeepSpeed-VisualChat-13B model with a 2B visual encoder and the 13B LLaMA model on several open-sourced datasets. DeepSpeed-VisualChat-13B shows image captioning capabilities (*Figure 6--8*), counting and text reading (*Figure 6*), celebrity recognition (*Figure 7*), storytelling (*Figure 8*), etc. + +
+ Friends + + *Figure 6: DeepSpeed-VisualChat can count the number of people in the image and read the text in the first image. It also demonstrates cross-image understanding.* +
+ + +
+ CEO + + *Figure 7: DeepSpeed-VisualChat can recognize celebrities and associate them with their achievements.* +
+ + +
+ Zootopia + + *Figure 8: DeepSpeed-VisualChat can tell stories and recognize movies.* +
+ + +# 6. How to begin with DeepSpeed-VisualChat +DeepSpeed-VisualChat is an easy-to-use training framework with great scalability, having been tested up to LLaMa-2-70B models so far. We adopt a unified instruction tuning format for all experiments, and the template is shown below. +``` + % You are a powerful vision-language assistant. + +### Image 1: % some image, e.g., cat-1.png +### Question: % please describe the image. +### Answer: % It's a cute black cat. + +### Image 2: % some image, e.g., cat-2.png +### Image 3: % some image, e.g., cat-3.png +### Question: % What's the difference between the three cats? +### Answer: % The colors of the three cats are different. +... +``` + +The training experience of DeepSpeed-VisualChat is straightforward and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model: +``` +git clone https://github.com/microsoft/DeepSpeedExamples.git +cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ +pip install -r requirements.txt +cd training +bash training_scripts/run_7b.sh +``` + +The trained checkpoint will be automatically saved in a Hugging Face-compatible version and can be used to launch your own visual chat API: +``` +cd ../chat +bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt path +``` +To support larger model inference, we have incorporated Hugging Face large model inference into our DeepSpeed-VisualChat API. Therefore, users can choose a different number of GPUs based on the GPU memory capacity and the model size. + +Please refer to our [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details. + +# 7. Release: Try DeepSpeed-VisualChat today! + +We are very excited to share that DeepSpeed-VisualChat is now open-sourced and available to the AI community. 
+ +* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) + +* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future. + + +DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more, + +* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. +* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. + +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. + +* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! 
diff --git a/blogs/deepspeed-visualchat/assets/images/attention.png b/blogs/deepspeed-visualchat/assets/images/attention.png new file mode 100644 index 000000000000..b01d8f8027ce Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/attention.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/cat-chat.png b/blogs/deepspeed-visualchat/assets/images/cat-chat.png new file mode 100755 index 000000000000..5a5c27381f65 Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/cat-chat.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/ceos.png b/blogs/deepspeed-visualchat/assets/images/ceos.png new file mode 100644 index 000000000000..e148f545a44b Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/ceos.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/data-blending.png b/blogs/deepspeed-visualchat/assets/images/data-blending.png new file mode 100644 index 000000000000..a8afb5144fb1 Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/data-blending.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/friends.png b/blogs/deepspeed-visualchat/assets/images/friends.png new file mode 100644 index 000000000000..2689d8d4bb1c Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/friends.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/hero-figure.png b/blogs/deepspeed-visualchat/assets/images/hero-figure.png new file mode 100644 index 000000000000..ca79b2c6239f Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/hero-figure.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/lake-chat.png b/blogs/deepspeed-visualchat/assets/images/lake-chat.png new file mode 100755 index 000000000000..c47199737d54 Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/lake-chat.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/model.png 
b/blogs/deepspeed-visualchat/assets/images/model.png new file mode 100644 index 000000000000..dbd1f05c484b Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/model.png differ diff --git a/blogs/deepspeed-visualchat/assets/images/zootopia.png b/blogs/deepspeed-visualchat/assets/images/zootopia.png new file mode 100644 index 000000000000..c9e3783ed198 Binary files /dev/null and b/blogs/deepspeed-visualchat/assets/images/zootopia.png differ diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index f35760a99a5c..32b0e8a32394 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -115,10 +115,13 @@ static int _do_io_complete(const long long int min_completes, std::vector>& reap_times) { const auto start_time = std::chrono::high_resolution_clock::now(); - const auto n_completes = io_getevents( - aio_ctxt->_io_ctxt, min_completes, max_completes, aio_ctxt->_io_events.data(), nullptr); + long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt, + min_completes, + max_completes, + aio_ctxt->_io_events.data(), + nullptr, + nullptr); reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); - assert(n_completes >= min_completes); return n_completes; } diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp index 055db8798a6b..e9c6a8505858 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -24,7 +24,8 @@ io_op_desc_t::io_op_desc_t(const bool read_op, _num_bytes(num_bytes), _validate(validate) { - _cpu_buffer = _buffer.is_cuda() ? _buffer.to(torch::kCPU).pin_memory() : _buffer; + _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()) ? 
_buffer.to(torch::kCPU).pin_memory() + : _buffer; _contiguous_buffer = _cpu_buffer.contiguous(); } @@ -33,6 +34,7 @@ char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr void io_op_desc_t::fini() { if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } + if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } } deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 7fbf5f61392e..accf431f6929 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -499,6 +499,17 @@ void all_reduce_caching(torch::Tensor& data, .wait()); } +static void parallel_memcpy(void* to, void* from, size_t n_bytes) + __attribute__((target("avx512bw"))); +static void parallel_memcpy(void* to, void* from, size_t n_bytes) +{ +#pragma omp parallel for + for (int i = 0; i < n_bytes; i += VECTOR_LENGTH_IN_BYTES) { + auto val = _mm256_loadu_si256((__m256i*)((char*)from + i)); + _mm256_storeu_si256((__m256i*)((char*)to + i), val); + } +} + void inference_all_reduce(torch::Tensor& data, py::object op, py::object group, bool async_op) { static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); @@ -517,8 +528,7 @@ void inference_all_reduce(torch::Tensor& data, py::object op, py::object group, default: data_type_fallback = true; } - if (data_size > MAX_BUF_SIZE || data_type_fallback || - (data_size % VECTOR_LENGTH_IN_BYTES) != 0 || !all_ranks_local_p) { + if (data_type_fallback || (data_size % VECTOR_LENGTH_IN_BYTES) != 0 || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), @@ -530,42 +540,46 @@ void inference_all_reduce(torch::Tensor& data, py::object op, py::object group, return; } - auto data_ptr = data.data_ptr(); + for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) { + auto data_ptr = 
((char*)(data.data_ptr()) + offset); + size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? MAX_BUF_SIZE : data_size - offset; + size_t chunk_el = chunk_size / (data_size / numel); - memcpy(workspace[world_rank].buffer, data_ptr, data_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_in_done; + parallel_memcpy(workspace[world_rank].buffer, data_ptr, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank].state = coll_allreduce_naive__copy_in_done; - if (world_rank == 0) { - // compute allreduce result on rank 0 - for (int i = 1; i < world_size; i++) { - // wait until the other rank copy the buffer - wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done); + if (world_rank == 0) { + // compute allreduce result on rank 0 + for (int i = 1; i < world_size; i++) { + // wait until the other rank copy the buffer + wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done); + } + reduce_all_buffers(workspace, chunk_el, data.scalar_type(), world_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank].state = coll_allreduce_naive__reduce_done; + parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); } - reduce_all_buffers(workspace, numel, data.scalar_type(), world_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__reduce_done; - memcpy(data_ptr, workspace[0].buffer, data_size); - } - if (world_rank != 0) { - wait_buffer_state_until(0, coll_allreduce_naive__reduce_done); - memcpy(data_ptr, workspace[0].buffer, data_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_out_done; - } - if (world_rank == 0) { - for (int i = 1; i < world_size; i++) { - wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done); + if (world_rank != 0) { + wait_buffer_state_until(0, 
coll_allreduce_naive__reduce_done); + parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank].state = coll_allreduce_naive__copy_out_done; + } + if (world_rank == 0) { + for (int i = 1; i < world_size; i++) { + wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done); + } + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank].state = coll_begin; + } + if (world_rank != 0) { + // if rank 0 spin too fast it could be in state 1 of next allreduce + // in this case wait_buffer_state_until(0, 0) may cause deadlock + // what we are certain is when rank 0 finishes the state won't be 2 + wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done); + workspace[world_rank].state = coll_begin; } - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_begin; - } - if (world_rank != 0) { - // if rank 0 spin too fast it could be in state 1 of next allreduce - // in this case wait_buffer_state_until(0, 0) may cause deadlock - // what we are certain is when rank 0 finishes the state won't be 2 - wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done); - workspace[world_rank].state = coll_begin; } } diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py index 6efa83179591..daf143919558 100644 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -314,10 +314,11 @@ def _replace(self, child, name, conv_linear_layer): child.weight.data = child.weight.data.transpose(-1, -2).contiguous() data = child.weight.data.split( (weight_shape[0] if self.conv_linear_layer else weight_shape[1]) // self.mp_size, dim=1) - data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) + data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() + del data setattr(child, "replaced", True) - return 
LinearAllreduce(torch.nn.parameter.Parameter(data, requires_grad=False), child.bias if child.bias is None else \ + return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \ torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) else: @@ -330,27 +331,29 @@ def _replace(self, child, name, conv_linear_layer): #for detecting fused type module_str = str(self.module).strip() #The copy is a regular copy, The shape of dst and src is the same - data = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) + data_dc = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) - bias_data = None if child.bias is None else prepare_tp_fused_qkvw( + bias_data_dc = None if child.bias is None else prepare_tp_fused_qkvw( module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( get_accelerator().current_device_name()) else: data = child.weight.data.split((weight_shape[0]) // self.mp_size, dim=1 if self.conv_linear_layer else 0) - data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) + data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() + del data if child.bias is not None: bias_data = child.bias.data.split( (weight_shape[1] if self.conv_linear_layer else weight_shape[0]) // self.mp_size, dim=0) bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) - bias_data = torch.nn.parameter.Parameter(bias_data, requires_grad=False) + bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) + del bias_data else: - bias_data = None + bias_data_dc = None setattr(child, "replaced", True) - return LinearLayer(weight=torch.nn.parameter.Parameter(data.to(get_accelerator().current_device_name()), requires_grad=False), \ - bias=bias_data) + return 
LinearLayer(weight=torch.nn.parameter.Parameter(data_dc.to(get_accelerator().current_device_name()), requires_grad=False), \ + bias=bias_data_dc) def _slice_embedding(self, child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: diff --git a/deepspeed/ops/transformer/inference/ds_mlp.py b/deepspeed/ops/transformer/inference/ds_mlp.py index b6638f98a0ea..36de06db920f 100644 --- a/deepspeed/ops/transformer/inference/ds_mlp.py +++ b/deepspeed/ops/transformer/inference/ds_mlp.py @@ -20,8 +20,8 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count self.config = config - data_type = torch.half if self.config.dtype == torch.int8 else self.config.dtype - data_type_fp = data_type + data_type = torch.int8 if self.config.dtype == torch.int8 else self.config.dtype + data_type_fp = torch.half if self.config.dtype == torch.int8 else self.config.dtype device = get_accelerator().current_device_name() proj_factor = 2 if self.config.mlp_act_func_type in GATED_ACTIVATION_TYPES else 1 diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index ad95e1f7c8ad..a929ca5842b6 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -133,7 +133,7 @@ def initialize_optimizer_states(self): return - def zero_grad(self, set_to_none=False): + def zero_grad(self, set_to_none=True): """ Zero FP16 parameter grads. """ diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index 9a61250d69f4..14271255df2e 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -110,7 +110,7 @@ def __init__(self, self.initialize_optimizer_states() - def zero_grad(self, set_to_none=False): + def zero_grad(self, set_to_none=True): """ Zero FP16 parameter grads. 
""" @@ -216,7 +216,7 @@ def step(self, closure=None): norm_group_value = get_weight_norm(grads_for_norm, mpu=self.mpu) norm_groups.append(norm_group_value) - # copying gradients to fp32 to wor k with fp32 parameters + # copying gradients to fp32 to work with fp32 parameters for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]): if fp16_param.grad is None: fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device) diff --git a/deepspeed/runtime/swap_tensor/async_swapper.py b/deepspeed/runtime/swap_tensor/async_swapper.py index 152b5b70a515..b808721537fe 100644 --- a/deepspeed/runtime/swap_tensor/async_swapper.py +++ b/deepspeed/runtime/swap_tensor/async_swapper.py @@ -10,6 +10,7 @@ from deepspeed import comm as dist from deepspeed.utils.logging import logger from deepspeed.runtime.swap_tensor.utils import swap_out_tensors, SwapBuffer +from deepspeed.accelerator import get_accelerator INVALID_BUFFER_INDEX = -1 ASYNC_SWAPPER_WAIT_TIMER = 'async_swap_gradient_wait' @@ -37,7 +38,7 @@ def has_buffers(self): def add_buffers(self, buffer_list): assert len(self.all_buffers) == 0 - assert all([buffer.is_pinned() for buffer in buffer_list]) + assert all([get_accelerator().is_pinned(buffer) for buffer in buffer_list]) dtype = buffer_list[0].dtype assert all([buffer.dtype == dtype for buffer in buffer_list]) diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 12be256f8055..86e43c98e7e5 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -15,6 +15,7 @@ from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, \ MIN_AIO_BYTES, AIO_ALIGNED_BYTES, get_sized_buffers from deepspeed.runtime.swap_tensor.utils import SwapBufferManager, SwapBufferPool +from deepspeed.accelerator import get_accelerator class FlattenedTensorSwapInfo(object): @@ -90,7 +91,7 @@ def 
get_swap_gradient_paths(self): return [grad.path for grad in self.swapped_gradients.values()] def get_unpinned_state_tensors(self): - return [t for t in self.tensors if not t.is_pinned()] + return [t for t in self.tensors if not get_accelerator().is_pinned(t)] def read_unswapped_gradients(self, dest_buffer): num_elem_count = 0 @@ -216,7 +217,7 @@ def _initialize_from_swapped_fp16_params(self, aio_handle, fp16_partitions_info, fp16_pinned_buffers, fp32_parameters): assert len(fp32_parameters) == len(fp16_partitions_info) assert len(fp32_parameters) == len(fp16_num_elems) - assert all([buffer.is_pinned() for buffer in fp16_pinned_buffers]) + assert all([get_accelerator().is_pinned(buffer) for buffer in fp16_pinned_buffers]) fp32_swap_paths = self._get_swap_paths(parameters=fp32_parameters, num_elems=fp16_num_elems) diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index 677bc2aa4a8e..e7bf06043fd7 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -17,6 +17,7 @@ get_sized_buffers from deepspeed.runtime.swap_tensor.async_swapper import AsyncTensorSwapper from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper +from deepspeed.accelerator import get_accelerator DEBUG_MODE = False @@ -174,7 +175,7 @@ def _separate_pinned_tensors(self, swap_info): unpinned_paths = [] for tensor, path in zip(swap_info.tensors, swap_info.swap_paths): - if tensor.is_pinned(): + if get_accelerator().is_pinned(tensor): pinned_tensors.append(tensor) pinned_paths.append(path) else: @@ -206,7 +207,7 @@ def _swap_in_gradients(self, aio_handle, parameter, dest_buffer): if not (swap_info and swap_info.has_gradients()): return - assert dest_buffer.is_pinned() + assert get_accelerator().is_pinned(dest_buffer) assert parameter.numel() <= dest_buffer.numel() parameter.grad = dest_buffer.narrow(0, 0, 
parameter.numel()) diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index 4109e0954148..fcc6a272883f 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -104,10 +104,11 @@ def _configure_aio(self, ds_config): self.available_buffer_ids = [i for i in range(self.param_buffer_count)] self.reserved_buffer_ids = [] - self.buffers = get_accelerator().pin_memory( - torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count), - dtype=self.dtype, - requires_grad=False)) + self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer * + self.param_buffer_count), + dtype=self.dtype, + requires_grad=False), + align_bytes=0) self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], @@ -313,7 +314,8 @@ def swap_in(self, params, async_op=True, swap_in_buffers=None): def swap_into_buffer(self, param, dest_buffer): assert param.ds_tensor.status == PartitionedParamStatus.NOT_AVAILABLE, f"param {param.ds_id} is already available or inflight" - require_swap_buffer = not (dest_buffer.is_pinned() and self._is_io_aligned(dest_buffer.numel())) + require_swap_buffer = not (get_accelerator().is_pinned(dest_buffer) + and self._is_io_aligned(dest_buffer.numel())) if require_swap_buffer: assert len(self.available_buffer_ids) > 0, f"No buffer available to swap param {param.ds_id}." 
@@ -378,8 +380,10 @@ def _is_io_aligned(self, numel): def reserve_partitioned_swap_space(self, partition_num_elems): aligned_numel = sum([self._io_aligned_numel(numel) for numel in partition_num_elems]) - self.partitioned_swap_buffer = get_accelerator().pin_memory( - torch.zeros(aligned_numel, device='cpu', dtype=self.dtype)) + self.partitioned_swap_buffer = get_accelerator().pin_memory(torch.zeros(aligned_numel, + device='cpu', + dtype=self.dtype), + align_bytes=0) self.partitioned_swap_pool = SwapBufferPool([self.partitioned_swap_buffer]) def swap_out_partitioned_params(self, dst_fp16_params, src_fp32_params): diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 50a88f74351a..90b2d9b8bd31 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -96,7 +96,7 @@ def get_swap_path(self, offset): class SwapBufferPool(object): def __init__(self, buffers): - assert all([buf.is_pinned() for buf in buffers]) + assert all([get_accelerator().is_pinned(buf) for buf in buffers]) self.buffers = [SwapBuffer(buf) for buf in buffers] self.current_index = 0 @@ -184,7 +184,8 @@ def __init__(self, num_elems, count, dtype): self.count = count self.dtype = dtype self.all_buffers = [ - get_accelerator().pin_memory(torch.zeros(num_elems, device='cpu', dtype=dtype)) for _ in range(count) + get_accelerator().pin_memory(torch.zeros(num_elems, device='cpu', dtype=dtype), align_bytes=0) + for _ in range(count) ] self.free_buffer_index = [i for i in range(count)] self.used_buffer_index = {} diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 7024b93d6820..b066090265c8 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -394,7 +394,6 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): Arguments: parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a single Tensor that will have gradients normalized - max_norm (float or int): max norm of the 
gradients norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. @@ -491,12 +490,12 @@ def get_weight_norm(parameters, norm_type=2, mpu=None): Arguments: parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a single Tensor that will have gradients normalized - max_norm (float or int): max norm of the gradients norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. Returns: Total norm of the parameters (viewed as a single vector). + -1 if the norm value is NaN or Inf. """ if isinstance(parameters, torch.Tensor): parameters = [parameters] diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index f4407f64e424..1fc11f0e46f5 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -236,7 +236,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. - This is set to ``False`` by default, which means unused parameters are + This is set to ``True`` by default, which means unused parameters are ignored and training continues. Now is just used in stage 2. """ diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 9509b5a692ca..ed44af08a686 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1569,7 +1569,7 @@ def get_partition_info(self, tensor_list, partition_size, partition_id): return params_in_partition, params_not_in_partition, first_offset @instrument_w_nvtx - def zero_grad(self, set_to_none=False): + def zero_grad(self, set_to_none=True): """ Zero FP16 parameter grads. 
""" diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index d64eb1ace5b5..8e5fb8b61c37 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1522,7 +1522,7 @@ def get_partition_info(self, tensor_list, partition_size, partition_id): return params_in_partition, params_not_in_partition, first_offset - def zero_grad(self, set_to_none=False): + def zero_grad(self, set_to_none=True): """ Zero FP16 parameter grads. """ diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index e6ad26ca67fd..cdd18e62a29e 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -427,6 +427,12 @@ Enabling and configuring ZeRO memory optimizations | ------------------------------------------------------------------------------------------------------------------- | ------- | | Copies the gradients to a contiguous buffer as they are produced. Avoids memory fragmentation during backward pass. | `True` | +**load_from_fp32_weights**: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------- | +| Initialize fp32 master weights from fp32 copies in checkpoint (no precision loss) or from model's fp16 copies (with precision loss). This can be used to initialize optimizer state even when checkpoint is missing optimizer state. 
| `True` | + **grad_hooks**: [boolean] | Description | Default | diff --git a/docs/index.md b/docs/index.md index 210e1494f7e2..79fd6baae250 100755 --- a/docs/index.md +++ b/docs/index.md @@ -7,6 +7,7 @@ title: "Latest News" --- DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) * [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](/deepspeed4science/)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] * [2023/08] [DeepSpeed ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) * [2023/08] [DeepSpeed-Chat: Llama/Llama-2 system support, efficiency boost, and training stability improvements](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31/README.md) @@ -135,6 +136,7 @@ comments. 24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847) 25. 
Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320). 26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) +27. Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf) # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 084cb10864cf..2db18e3629a1 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -79,7 +79,7 @@ def is_compatible(self, verbose=True): # which is a function provided by libaio that is used in the async_io op. # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS # respectively to specify the directories for libaio.h and libaio.so. 
- aio_compatible = self.has_function('io_submit', ('aio', )) + aio_compatible = self.has_function('io_pgetevents', ('aio', )) if verbose and not aio_compatible: self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") diff --git a/release/check_release_version.py b/release/check_release_version.py index abf1e403f318..148fa8aa3c42 100644 --- a/release/check_release_version.py +++ b/release/check_release_version.py @@ -8,22 +8,13 @@ parser = argparse.ArgumentParser() -parser.add_argument("--new_version", type=str, help="The new version being published.") +parser.add_argument("--release_version", type=str, help="The new version being published.") args = parser.parse_args() -new_version = pkg_version.parse(args.new_version) +release_version = pkg_version.parse(args.release_version) with open('./version.txt') as fd: - current_version = pkg_version.parse(fd.read()) + repo_version = pkg_version.parse(fd.read()) -# Valid version are those where the major/minor/micro are incremented by no more than one from the existing release, and the less significant values are reset to 0. -valid_major_update = pkg_version.Version(f'{current_version.major + 1}.0.0') -valid_minor_update = pkg_version.Version(f'{current_version.major}.{current_version.minor + 1}.0') -valid_micro_update = pkg_version.Version( - f'{current_version.major}.{current_version.minor}.{current_version.micro + 1}') - -valid_versions = [valid_major_update, valid_minor_update, valid_micro_update] - -if new_version not in valid_versions: - raise Exception(f'{new_version} is an invalid version. 
Valid versions are {valid_versions}.\n') +assert repo_version == release_version, f"{repo_version=} does not match {release_version=}, unable to proceed" diff --git a/release/release.sh b/release/release.sh index a4d38674d48c..a83fafcb9b1f 100644 --- a/release/release.sh +++ b/release/release.sh @@ -26,7 +26,7 @@ if [ "${version}" != `cat version.txt` ]; then fi echo "checking that the version is valid" -python release/check_release_version.py --new_version ${version} +python release/check_release_version.py --release_version ${version} if [ $? != 0 ]; then echo 'please check the version number selected' exit 1 diff --git a/scripts/check-license.py b/scripts/check-license.py index 67caa30a3e3f..e5d5792d06b6 100755 --- a/scripts/check-license.py +++ b/scripts/check-license.py @@ -19,20 +19,24 @@ def err(s: str) -> None: COPYRIGHT = [ - r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", - r"^\(\/\/\|#\) DeepSpeed Team$" + (r"^# Copyright (c) Microsoft Corporation.$", r"^\/\/ Copyright (c) Microsoft Corporation.$"), + (r"^# SPDX-License-Identifier: Apache-2.0$", r"^\/\/ SPDX-License-Identifier: Apache-2.0$"), + (r"^# DeepSpeed Team$", r"^\/\/ DeepSpeed Team$"), ] success = True failures = [] for f in sys.argv[1:]: for copyright_line in COPYRIGHT: - if not success: - break - res = subprocess.run(["git", "grep", "--quiet", "-e", copyright_line, f], capture_output=True) + cmd = ["git", "grep", "--quiet"] + for line in copyright_line: + cmd.extend(["-e", line]) + cmd.append(f) + res = subprocess.run(cmd, capture_output=True) if res.returncode == 1: success = False failures.append(f) + break elif res.returncode == 2: err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") err(res.stderr.decode("utf-8"))