diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index 0a57d9724..b0ce3d99b 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -65,9 +65,9 @@ def main(**kwargs):
         # TODO: may have to init qccl backend, next try run with torchrun command
         torch_device = torch.device(device)
         assert torch_device.type != "cpu", "Host doesn't support single-node DDP"
-        assert (
-            torch_device.index is None
-        ), f"DDP requires specification of device type only, however provided device index as well: {torch_device}"
+        assert torch_device.index is None, (
+            f"DDP requires specification of device type only, however provided device index as well: {torch_device}"
+        )
         dist.init_process_group(backend=train_config.dist_backend)
         # from here onward "qaic/cuda" will automatically map to "qaic:i/cuda:i", where i = process rank
         getattr(torch, torch_device.type).set_device(dist.get_rank())
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index 1289934e7..fe5493978 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -96,7 +96,7 @@ def train(
 
     # Start the training loop
     for epoch in range(train_config.num_epochs):
-        print(f"Starting epoch {epoch+1}/{train_config.num_epochs}")
+        print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
         print(f"train_config.max_train_step: {train_config.max_train_step}")
         # stop when the maximum number of training steps is reached
         if max_steps_reached:
@@ -108,7 +108,7 @@ def train(
         total_length = len(train_dataloader) // train_config.gradient_accumulation_steps
         pbar = tqdm(
             colour="blue",
-            desc=f"Training Epoch: {epoch+1}",
+            desc=f"Training Epoch: {epoch + 1}",
             total=total_length,
             dynamic_ncols=True,
         )
@@ -123,9 +123,9 @@ def train(
                 break
             batch = {k: v.to(device) for k, v in batch.items()}  # move the batch elements to qaic device
 
-            with torch.autocast(
-                device_type=device, dtype=torch.float16
-            ) if train_config.use_autocast else nullcontext():
+            with (
+                torch.autocast(device_type=device, dtype=torch.float16) if train_config.use_autocast else nullcontext()
+            ):
                 # an additional condition can be put here to avoid opByOpVerifier getting triggered for each step
                 if train_config.opByOpVerifier:
                     with qaic_debug.OpByOpVerifierMode(
@@ -183,7 +183,7 @@ def train(
                     model.save_pretrained(train_config.output_dir + f"/trained_weights/step_{step}")
 
             pbar.set_description(
-                f"Training Epoch: {epoch+1}/{train_config.num_epochs}, step {step+1}/{len(train_dataloader)} completed (loss: {loss.detach().float()})"
+                f"Training Epoch: {epoch + 1}/{train_config.num_epochs}, step {step + 1}/{len(train_dataloader)} completed (loss: {loss.detach().float()})"
             )
             if train_config.save_metrics:
                 save_to_json(
@@ -244,11 +244,11 @@ def train(
         if train_config.run_validation:
             if eval_epoch_loss < best_val_loss:
                 best_val_loss = eval_epoch_loss
-                print(f"best eval loss on epoch {epoch+1} is {best_val_loss}")
+                print(f"best eval loss on epoch {epoch + 1} is {best_val_loss}")
             val_loss.append(float(eval_epoch_loss))
             val_prep.append(float(eval_ppl))
         print(
-            f"Epoch {epoch+1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
+            f"Epoch {epoch + 1}: train_perplexity={train_perplexity:.4f}, train_epoch_loss={train_epoch_loss:.4f}, epoch time {epoch_end_time}s"
         )
 
         # Saving the results every epoch to plot later
@@ -322,9 +322,9 @@ def evaluation(model, train_config, eval_dataloader, local_rank, tokenizer, device):
         # Ensure no gradients are computed for this scope to save memory
         with torch.no_grad():
             # Forward pass and compute loss
-            with torch.autocast(
-                device_type=device, dtype=torch.float16
-            ) if train_config.use_autocast else nullcontext():
+            with (
+                torch.autocast(device_type=device, dtype=torch.float16) if train_config.use_autocast else nullcontext()
+            ):
                 outputs = model(**batch)
                 loss = outputs.loss
 
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index c9f17a73e..d1bb02a29 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -48,7 +48,7 @@ pipeline {
         }
         stage('Run Non-CLI QAIC Tests') {
             steps {
-                timeout(time: 60, unit: 'MINUTES') {
+                timeout(time: 70, unit: 'MINUTES') {
                     sh '''
                     sudo docker exec ${BUILD_TAG} bash -c "
                     cd /efficient-transformers &&
@@ -56,7 +56,7 @@ pipeline {
                     mkdir -p $PWD/Non_qaic &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Non_qaic &&
-                    pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 4 --junitxml=tests/tests_log2.xml &&
+                    pytest tests -m '(not cli) and (on_qaic) and (not qnn)' -n 3 --junitxml=tests/tests_log2.xml &&
                     deactivate"
                     '''
                 }
diff --git a/scripts/perplexity_computation/calculate_perplexity.py b/scripts/perplexity_computation/calculate_perplexity.py
index 9e24b2c66..d33f20822 100644
--- a/scripts/perplexity_computation/calculate_perplexity.py
+++ b/scripts/perplexity_computation/calculate_perplexity.py
@@ -200,7 +200,7 @@ def torch_perplexity(
         loop_time = time.time() - loop_s
 
         logger.info(
-            f"E2E Sample Time: {(loop_time)/batch_size:.4f}s\t E2E TOKENS/S : {((ctx_len-prompt_len)*batch_size)/loop_time:.2f}"
+            f"E2E Sample Time: {(loop_time) / batch_size:.4f}s\t E2E TOKENS/S : {((ctx_len - prompt_len) * batch_size) / loop_time:.2f}"
         )
 
         del outputs
@@ -332,7 +332,7 @@ def calculate_perplexity(
         loop_time = time.time() - loop_s
 
         logger.info(
-            f"e2e sample time: {(loop_time)/batch_size:.4f}s\t e2e tokens/s : {((ctx_len-prompt_len)*batch_size)/loop_time:.2f}"
+            f"e2e sample time: {(loop_time) / batch_size:.4f}s\t e2e tokens/s : {((ctx_len - prompt_len) * batch_size) / loop_time:.2f}"
         )
 
     avg_loss = total_loss / total_tokens
@@ -415,7 +415,7 @@ def main():
         print(f"Dataset Stride: {args.stride}", file=fp)
         print(f"Overall Loss: {loss}", file=fp)
         print(f"Perplexity: {perplexity}", file=fp)
-        print(f"Total time for evaluation: {(time.time()-start_time)/3600.0} hrs", file=fp)
+        print(f"Total time for evaluation: {(time.time() - start_time) / 3600.0} hrs", file=fp)
         if isinstance(args.model_type, str) and args.model_type == "torch":
             print("\n*******************************************************", file=fp)
             print(f"Torch Original Perplexity: {perplexity}", file=fp)
diff --git a/tests/base/test_onnx_transforms.py b/tests/base/test_onnx_transforms.py
index dbbbbda1f..8bc622e00 100644
--- a/tests/base/test_onnx_transforms.py
+++ b/tests/base/test_onnx_transforms.py
@@ -77,9 +77,9 @@ def test_split_tensors_transform(tmp_path):
         >
         test_split () => ()
         <
-            float[1, 32] tensor0 = [ "location": "{external_tensors_file}", "offset": "0", "length": "{32*4}" ],
-            float[1, 32] tensor1 = [ "location": "{external_tensors_file}", "offset": "{32*4}", "length": "{32*4}" ],
-            float[1, 16] tensor2 = [ "location": "{external_tensors_file}", "offset": "{64*4}", "length": "{16*4}" ]
+            float[1, 32] tensor0 = [ "location": "{external_tensors_file}", "offset": "0", "length": "{32 * 4}" ],
+            float[1, 32] tensor1 = [ "location": "{external_tensors_file}", "offset": "{32 * 4}", "length": "{32 * 4}" ],
+            float[1, 16] tensor2 = [ "location": "{external_tensors_file}", "offset": "{64 * 4}", "length": "{16 * 4}" ]
         >
         {{
         }}
diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py
index 50ad3551d..fe906fe7e 100644
--- a/tests/qnn_tests/test_causal_lm_models_qnn.py
+++ b/tests/qnn_tests/test_causal_lm_models_qnn.py
@@ -86,9 +86,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
-    assert (
-        pytorch_hf_tokens == pytorch_kv_tokens
-    ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
+        "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    )
 
     onnx_model_path = qeff_model.export()
     ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path)
@@ -109,9 +109,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     gen_len = ort_tokens.shape[-1]
-    assert (
-        ort_tokens == cloud_ai_100_tokens[:, :gen_len]
-    ).all(), "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
+        "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    )
 
     # testing for CB models
     model_hf, _ = load_causal_lm_model(model_config)
diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py
index f98e5af4b..15f4b7dcb 100644
--- a/tests/text_generation/test_text_generation.py
+++ b/tests/text_generation/test_text_generation.py
@@ -98,6 +98,6 @@ def test_generate_text_stream(
     for decoded_tokens in text_generator.generate_stream_tokens(Constants.INPUT_STR, generation_len=max_gen_len):
         stream_tokens.extend(decoded_tokens)
 
-    assert (
-        cloud_ai_100_output == stream_tokens
-    ), f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
+    assert cloud_ai_100_output == stream_tokens, (
+        f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
+    )
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 6e91711e0..8f23fac89 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -110,9 +110,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
-    assert (
-        pytorch_hf_tokens == pytorch_kv_tokens
-    ).all(), "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
+        "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    )
 
     onnx_model_path = qeff_model.export()
     ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
@@ -133,9 +133,9 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     gen_len = ort_tokens.shape[-1]
-    assert (
-        ort_tokens == cloud_ai_100_tokens[:, :gen_len]
-    ).all(), "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
+        "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    )
 
     # testing for CB models
     model_hf, _ = load_causal_lm_model(model_config)
@@ -204,9 +204,9 @@ def test_causal_lm_export_with_deprecated_api(model_name):
     new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
     old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
 
-    assert (
-        new_api_ort_tokens == old_api_ort_tokens
-    ).all(), "New API output does not match old API output for ONNX export function"
+    assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
+        "New API output does not match old API output for ONNX export function"
+    )
 
 
 @pytest.mark.on_qaic
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index 6e1b70f79..2e5f55cc7 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -74,9 +74,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
     """
     num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
     input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
-    assert (
-        input_len_padded <= ctx_len
-    ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
     return input_len_padded
 
 
@@ -325,9 +325,9 @@ def test_spec_decode_inference(
     for prompt, generation in zip(prompts, batch_decode):
         print(f"{prompt=} {generation=}")
     # validation check
-    assert mean_num_accepted_tokens == float(
-        num_speculative_tokens + 1
-    ), f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens+1}"
+    assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), (
+        f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
+    )
     del target_model_session
     del draft_model_session
     generated_ids = np.asarray(generated_ids).flatten()
diff --git a/tests/transformers/test_transformer_pytorch_transforms.py b/tests/transformers/test_transformer_pytorch_transforms.py
index e6a7d4588..d7151aad7 100644
--- a/tests/transformers/test_transformer_pytorch_transforms.py
+++ b/tests/transformers/test_transformer_pytorch_transforms.py
@@ -320,9 +320,9 @@ def test_awq_to_matmulnbits_transform(in_features, out_features):
     assert transformed
     new_out = new_module(rand_data)
     assert isinstance(new_module, QuantLinearORT)
-    assert compare_original_vs_kv_model_pt_outputs(
-        old_out, new_out, tolerance=1e-8
-    ), "Test failed because MAE is greater than tolerance"
+    assert compare_original_vs_kv_model_pt_outputs(old_out, new_out, tolerance=1e-8), (
+        "Test failed because MAE is greater than tolerance"
+    )
 
 
 @pytest.mark.parametrize("in_features", [4096, 4096])
@@ -349,6 +349,6 @@ def test_gptq_to_matmulnbits_transform(in_features, out_features):
     assert transformed
     new_out = new_module(rand_data)
     assert isinstance(new_module, QuantLinearORT)
-    assert compare_original_vs_kv_model_pt_outputs(
-        old_out, new_out, tolerance=1e-4
-    ), "Test failed because MAE is greater than tolerance"
+    assert compare_original_vs_kv_model_pt_outputs(old_out, new_out, tolerance=1e-4), (
+        "Test failed because MAE is greater than tolerance"
+    )