From f5e5e4e386ec2ebe94b3043606fd59a12452981e Mon Sep 17 00:00:00 2001 From: Peter McAughan Date: Tue, 7 Nov 2023 23:45:36 +0000 Subject: [PATCH 1/9] Fix for CUDAExecutionProvider --- .../cpu/transformers/beam_search_parameters.cc | 13 ++++++++++++- .../contrib_ops/cuda/transformers/beam_search.cc | 1 + onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 3962486d5b5eb..6e6a3e616e6cc 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -123,8 +123,19 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { logits_processor = logits_processor_tensor ? static_cast(*logits_processor_tensor->Data()) : 0; ORT_ENFORCE(logits_processor >= 0, "logits_processor shall be a non-negative integer, got ", logits_processor); -} + auto* temperature_tensor = context->Input(14); + auto* temperature_tensor = context->Input(6); + if (temperature_tensor) { + if (temperature_tensor->IsDataType()) { + temperature = *temperature_tensor->Data(); + } else { + temperature = static_cast(*temperature_tensor->Data()); + } + } else { + temperature = 1.0f; + } +} void BeamSearchParameters::SetSubgraphParameters(int vocabulary_size, int heads, int hidden_size_per_head, int layers) { // Override vocab_size using the inferred shape from the decoder subgraph ONLY IF // the vocab_size hasn't been explicitly specified by the user (as an attribute of BeamSearch) diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc b/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc index 2a90e4911f286..08cbb145a6f65 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search.cc @@ -49,6 +49,7 @@ ONNX_OPERATOR_KERNEL_EX( .InputMemoryType(OrtMemTypeCPUInput, 9) // 'attention_mask' needs to be on CPU .InputMemoryType(OrtMemTypeCPUInput, 10) // 'decoder_input_ids' needs to be on CPU .InputMemoryType(OrtMemTypeCPUInput, 11) // 'logits_processor' needs to be on CPU + .InputMemoryType(OrtMemTypeCPUInput, 14) // 'temperature' needs to be on CPU .OutputMemoryType(OrtMemTypeCPUOutput, 0) // 'sequences' output on CPU .OutputMemoryType(OrtMemTypeCPUOutput, 1) // 'sequences_scores' output on CPU .TypeConstraint("T", {DataTypeImpl::GetTensorType(), diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 39449bea6303a..49bb657f68cc0 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1231,6 +1231,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) " "are treated as stop of the extra_decoding_ids for corresponding batch.", "I", OpSchema::Optional) + .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I") .Output(1, "sequences_scores", "Final beam score of the generated sequences. 
Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional) .Output(2, "scores", From d989904cc9009c13de888cd28b1ed58d6389e747 Mon Sep 17 00:00:00 2001 From: Peter McAughan Date: Wed, 13 Dec 2023 18:18:58 +0000 Subject: [PATCH 2/9] Debugging --- include/onnxruntime/core/framework/tensor.h | 8 ++++---- .../cpu/transformers/beam_search_impl_whisper.h | 8 ++++++++ .../cpu/transformers/beam_search_parameters.cc | 3 ++- .../contrib_ops/cpu/transformers/beam_search_scorer.cc | 10 +++++++++- .../contrib_ops/cpu/transformers/logits_processor.h | 2 +- .../cuda/transformers/generation_cuda_impl.cu | 1 - onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 7 ++++--- onnxruntime/core/session/inference_session.cc | 1 + 8 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index a867ab6066485..c4e773ebe8f6b 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -186,8 +186,8 @@ class Tensor final { template T* MutableData() { // Type check - ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", - "T ", "!=", dtype_); + //ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", + // "T ", "!=", dtype_); return reinterpret_cast(static_cast(p_data_) + byte_offset_); } @@ -197,8 +197,8 @@ class Tensor final { template gsl::span MutableDataAsSpan() { // Type check - ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", - "T ", "!=", dtype_); + //ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", + // "T ", "!=", dtype_); T* data = reinterpret_cast(static_cast(p_data_) + byte_offset_); return gsl::make_span(data, static_cast(shape_.Size())); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index 91b93a125ad7a..c9fbc5861583f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -464,6 +464,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe } } + std::cout << "Done with search!" << std::endl; if (decoder_subgraph_.output_cross_qk_) { TensorShape cross_qk_shape{ static_cast(parameters->batch_size), @@ -493,12 +494,18 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe beam_indices)); } + std::cout << "Done with QK!..." << std::endl; gsl::span final_beam_scores = beam_state.beam_scores; + std::cout << "final_beam_scores" << std::endl; + std::cout << "float?" << output_sequences_scores->IsDataType()<< std::endl; + std::cout << "float16?" << output_sequences_scores->IsDataType() << std::endl; this->beam_scorer_->Finalize(cpu_state.sequences, final_beam_scores, output_sequences, output_sequences_scores); + std::cout << "Outputting scores..." << std::endl; + // Output per token scores if (output_scores) { gsl::span target = output_scores->MutableDataAsSpan(); @@ -507,6 +514,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice)); } + std::cout << "Outputting scores!" 
<< std::endl; return status; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 6e6a3e616e6cc..4de031a9f9e83 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -125,7 +125,6 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { "logits_processor shall be a non-negative integer, got ", logits_processor); auto* temperature_tensor = context->Input(14); - auto* temperature_tensor = context->Input(6); if (temperature_tensor) { if (temperature_tensor->IsDataType()) { temperature = *temperature_tensor->Data(); @@ -135,6 +134,8 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { } else { temperature = 1.0f; } + //auto* temperature_tensor = context->Input(14); + //temperature = temperature_tensor ? *temperature_tensor->Data() : 1.0f; } void BeamSearchParameters::SetSubgraphParameters(int vocabulary_size, int heads, int hidden_size_per_head, int layers) { // Override vocab_size using the inferred shape from the decoder subgraph ONLY IF diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index 7e2e5b2129221..e01391c7fa4fd 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc @@ -13,6 +13,8 @@ #include "core/providers/cpu/tensor/utils.h" #include "core/providers/cpu/rnn/rnn_helpers.h" #include "contrib_ops/cpu/transformers/beam_search_scorer.h" +#include +#include namespace onnxruntime { namespace contrib { @@ -187,13 +189,15 @@ void BeamSearchScorer::Finalize(ISequences& sequences, Tensor* output_sequence_scores) { ORT_ENFORCE(output_sequences != nullptr); + // Finalize all open beam hypotheses and add to generated hypotheses. + std::cout << "here1" << std::endl; for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; if (beam_hyp.done_) { continue; } - + std::cout << "here2" << std::endl; for (size_t beam_index = 0; beam_index < num_beams_; beam_index++) { size_t batch_beam_index = batch_index * num_beams_ + beam_index; float final_score = final_beam_scores[batch_beam_index]; @@ -208,12 +212,15 @@ void BeamSearchScorer::Finalize(ISequences& sequences, // Fill output sequences with pad token ID so that we do not need append it later. std::fill_n(output.data(), output.size(), pad_token_id_); + std::cout << "here3" << std::endl; // Score of each sequence, with shape (batch_size * num_return_sequences). + //gsl::span sequence_scores; gsl::span sequence_scores; if (output_sequence_scores) { sequence_scores = output_sequence_scores->MutableDataAsSpan(); } + std::cout << "here4" << std::endl; // Select the best hypotheses according to number of sequences to return. 
for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; @@ -227,6 +234,7 @@ void BeamSearchScorer::Finalize(ISequences& sequences, beam_hyp.Output(narrow(num_return_sequences_), narrow(max_length_), batch_output, sequence_scores_buffer); } + std::cout << "here5" << std::endl; } } // namespace transformers diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h index 4688ff272cee9..43a449520dfaa 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h @@ -10,7 +10,7 @@ #include "contrib_ops/cpu/transformers/greedy_search_parameters.h" #include "contrib_ops/cpu/transformers/sampling_parameters.h" #include "contrib_ops/cpu/transformers/generation_shared.h" - +#include namespace onnxruntime { namespace contrib { namespace transformers { diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index dbd7fb010462d..0f3bc25d0ea8e 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -8,7 +8,6 @@ #include #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" - namespace onnxruntime { namespace contrib { namespace cuda { diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 49bb657f68cc0..6cdc759eb1641 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1231,14 +1231,14 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) " "are treated as stop of the extra_decoding_ids for corresponding batch.", "I", OpSchema::Optional) - .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional) + .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I") - .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional) + .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "L", OpSchema::Optional) .Output(2, "scores", "Processed beam scores for each vocabulary token at each generation step." "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam." "Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)", - "T", OpSchema::Optional) + "L", OpSchema::Optional) .Output(3, "cross_qk", "Output the accumulated stacked Q*K in cross attentions. 
Let H = number of Head of cross attention, " "F = the frames or kv-seq-len of the cross attention input, T = real decoded token length, L = number of layers," @@ -1251,6 +1251,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "The prob is save before logits may be updated by extra-decoding-ids. The shape of non_speech_probs is [B]", "T", OpSchema::Optional) .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain to float tensors.") + .TypeConstraint("L", {"tensor(float)", "tensor(float16)"}, "Constrain to float tensors.") .TypeConstraint("F", {"tensor(float)", "tensor(int32)", "tensor(float16)"}, "Constrain input type to float or int tensors.") .TypeConstraint("I", {"tensor(int32)"}, "Constrain to integer types") .TypeConstraint("M", {"tensor(int32)"}, "Constrain mask to integer types") diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 1163be27b1685..240cad213fbca 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1911,6 +1911,7 @@ const DataTransferManager& InferenceSession::GetDataTransferManager() const { common::Status InferenceSession::CheckShapes(const std::string& input_output_name, const TensorShape& input_output_shape, const TensorShape& expected_shape, const char* input_output_moniker) const { + const auto shape_size = input_output_shape.NumDimensions(); const auto expected_shape_size = expected_shape.NumDimensions(); if (shape_size != expected_shape_size) { From 676b6f078d0ba692a4ee141782b65ef18d134a26 Mon Sep 17 00:00:00 2001 From: Peter McAughan Date: Wed, 13 Dec 2023 23:35:12 +0000 Subject: [PATCH 3/9] Debugging dump --- .../cpu/transformers/beam_search_impl_base.h | 5 +++ .../transformers/beam_search_impl_whisper.h | 15 +++++++- .../transformers/beam_search_parameters.cc | 2 ++ .../cpu/transformers/beam_search_scorer.cc | 1 + .../transformers/generation_device_helper.cc | 7 ++-- .../cpu/transformers/logits_processor.cc | 4 ++- .../cuda/transformers/generation_cuda_impl.cu | 29 +++++++++++++-- .../cuda/transformers/generation_cuda_impl.h | 3 +- .../transformers/generation_device_helper.cc | 35 +++++++++++++++++-- 9 files changed, 91 insertions(+), 10 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h index 29b38fc234de5..9ca14a334cc0c 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h @@ -247,6 +247,7 @@ Status BeamSearchBase::ProcessLogits( BeamSearchCpuState& cpu_state, AllocatorPtr& allocator, int counter) { + std::cout << "Processing logits?" << std::endl; return process_logits_func_(logits, &beam_state, &(cpu_state.sequences), allocator, thread_pool_, &logits_processors_, beam_scorer_.get(), parameters_, counter, ort_stream_, GetConsoleDumper()); @@ -260,7 +261,9 @@ Status BeamSearchBase::GenerateNextToken( BeamSearchCpuState& cpu_state, int counter) { // Process logits to get next token scores + std::cout << "Processing Logits!" << std::endl; ORT_RETURN_IF_ERROR(ProcessLogits(logits, beam_state, cpu_state, temp_space_allocator_, counter)); + std::cout << "Getting next scores!" 
<< std::endl; if (this->IsCuda()) { auto beam_scores = beam_scorer_->GetNextScores(); @@ -274,6 +277,8 @@ Status BeamSearchBase::GenerateNextToken( beam_next_tokens = beam_scorer_->GetNextTokens(); + std::cout << "Got next scores" << std::endl; + #ifdef DEBUG_GENERATION auto beam_indices = beam_scorer_->GetNextIndicesGPU(); cuda_dumper_->Print("beam_scores from scorer", beam_state.beam_scores.data(), parameters_->batch_size, parameters_->num_beams); diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index c9fbc5861583f..d2baefc9800c2 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -114,6 +114,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe const BeamSearchParameters* parameters = this->parameters_; + std::cout << "Executing Encoder" << std::endl; // Allocate output tensors. int64_t sequences_dims[] = {parameters->batch_size, parameters->num_return_sequences, parameters->max_length}; TensorShape sequences_shape(&sequences_dims[0], sizeof(sequences_dims) / sizeof(sequences_dims[0])); @@ -201,10 +202,13 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe } #endif +std::cout << "Done with encoder!" << std::endl; + // ------------------------------------ // Initialize resources // ------------------------------------ +std::cout << "Initializing beam_State" << std::endl; BeamSearchState beam_state{*parameters, this->temp_space_allocator_, decoder_subgraph_.has_decoder_masked_attention_, @@ -237,6 +241,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe // ------------------------------------------------------------------------------ // Generate next token from logits output from encoder, and initialize decoder inputs. // ------------------------------------------------------------------------------ + std::cout << "Initialized beam_State!" << std::endl; gsl::span beam_next_tokens; int iteration_counter = 0; @@ -255,6 +260,8 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe std::vector decoder_fetches; + std::cout << "Executing search!" << std::endl; + if (current_length + 1 < parameters->max_length) { ++iteration_counter; ORT_RETURN_IF_ERROR(this->GenerateNextToken(encoder_fetches[0], @@ -263,7 +270,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe cpu_state, iteration_counter)); ++current_length; // Increase sequence length after a new token is generated. 
- + std::cout << "1" << std::endl; ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(this->cpu_allocator_, ReinterpretAsSpan(beam_next_tokens), this->implicit_inputs_, @@ -280,6 +287,7 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe cpu_state.sequences, parameters->max_length, decoder_subgraph_.has_decoder_masked_attention_)); + std::cout << "2" << std::endl; if (decoder_subgraph_.past_present_share_buffer_) { decoder_fetches.reserve(static_cast(decoder_subgraph_.GetFirstPresentOutputIndex()) + @@ -296,7 +304,10 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe } } + std::cout << "3" << std::endl; + if (decoder_subgraph_.output_cross_qk_) { + std::cout << "QK" << std::endl; ORT_ENFORCE(decoder_subgraph_.has_decoder_masked_attention_, "decoder subgraph: output_cross_qk could only work with has_decoder_masked_attention"); ORT_ENFORCE(decoder_subgraph_.past_present_share_buffer_, "decoder subgraph: output_cross_qk could only work with past_present_share_buffer"); @@ -499,6 +510,8 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe std::cout << "final_beam_scores" << std::endl; std::cout << "float?" << output_sequences_scores->IsDataType()<< std::endl; std::cout << "float16?" << output_sequences_scores->IsDataType() << std::endl; + std::cout << "Output Scores?" << output_scores << std::endl; + this->beam_scorer_->Finalize(cpu_state.sequences, final_beam_scores, output_sequences, diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 4de031a9f9e83..65946f9e251dc 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -124,6 +124,7 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { ORT_ENFORCE(logits_processor >= 0, "logits_processor shall be a non-negative integer, got ", logits_processor); + std::cout << "Reading in temperature now!" << std::endl; auto* temperature_tensor = context->Input(14); if (temperature_tensor) { if (temperature_tensor->IsDataType()) { @@ -134,6 +135,7 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { } else { temperature = 1.0f; } + //std::cout << "Temperature Setasdasd!!" << temperature << std::endl; //auto* temperature_tensor = context->Input(14); //temperature = temperature_tensor ? *temperature_tensor->Data() : 1.0f; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index e01391c7fa4fd..9908842665d5d 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc @@ -215,6 +215,7 @@ void BeamSearchScorer::Finalize(ISequences& sequences, std::cout << "here3" << std::endl; // Score of each sequence, with shape (batch_size * num_return_sequences). //gsl::span sequence_scores; + // cast each element to FP16? 
gsl::span sequence_scores; if (output_sequence_scores) { sequence_scores = output_sequence_scores->MutableDataAsSpan(); diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc index 927d3a58e5a6f..2a65187f81ea9 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc @@ -292,7 +292,7 @@ Status ProcessLogits(const OrtValue& logits, // #ifndef DEBUG_GENERATION ORT_UNUSED_PARAMETER(dumper); #endif - + std::cout << "PRocessing Logits!" << std::endl; int batch_size = parameters->batch_size; int num_beams = parameters->num_beams; int vocab_size = parameters->vocab_size; @@ -334,7 +334,7 @@ Status ProcessLogits(const OrtValue& logits, // dumper->Print("next_token_logits", next_token_logits.data(), batch_size, num_beams, vocab_size); } #endif - +std::cout << "Next token score!" << std::endl; // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; ORT_RETURN_IF_ERROR( @@ -349,9 +349,10 @@ Status ProcessLogits(const OrtValue& logits, // #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); #endif - +std::cout << "Processors starting" << std::endl; // Apply all score processors that updates scores logits_processors->Process(sequences, next_token_scores, step); +std::cout << "Processor ending" << std::endl; #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after logits process", next_token_scores.data(), batch_size, num_beams, vocab_size); diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc index f39f090c78b0c..0db1abf6da673 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc @@ -184,6 +184,8 @@ TemperatureLogitsProcessor::TemperatureLogitsProcessor(float temperature) : t template void TemperatureLogitsProcessor::Process(const ISequences* /*sequences*/, NextTokenScores& next_token_scores) { + + std::cout << "Applying Temperature!" << std::endl; if (temperature_ == 1.0f) { return; } @@ -193,7 +195,7 @@ void TemperatureLogitsProcessor::Process(const ISequences* /*sequences*/, *p /= temperature_; ++p; } - +std::cout << "Applied temperature!" 
<< std::endl; #ifdef DEBUG_GENERATION DumpScores("TemperatureLogitsProcessor", next_token_scores); #endif diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 0f3bc25d0ea8e..590f4da0a8a79 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -8,6 +8,7 @@ #include #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" +#include namespace onnxruntime { namespace contrib { namespace cuda { @@ -500,13 +501,14 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp next_beam_tokens.data()); } +template __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, const int32_t* sequences_buffer, int sequence_length, BeamHypotheses* beam_hyps_, const float* final_beam_scores, int32_t* output, - float* sequence_scores) { + T* sequence_scores) { int batch_index = blockIdx.x * blockDim.x + threadIdx.x; if (batch_index >= state.batch_size_) return; @@ -533,6 +535,7 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, sequence_scores ? sequence_scores + batch_index * state.num_return_sequences_ : nullptr); } +template void LaunchBeamSearchScorer_Finalize(int batch_size, BeamScorerState& state, gsl::span sequences, @@ -540,7 +543,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream) { BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state, sequences.data(), @@ -551,6 +554,28 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, sequence_scores.data()); } +template void LaunchBeamSearchScorer_Finalize(int batch_size, + BeamScorerState& state, + gsl::span sequences, + int sequence_length, + gsl::span beam_hyps, + gsl::span final_beam_scores, + gsl::span output, + gsl::span sequence_scores, + cudaStream_t stream); + +/* +template void LaunchBeamSearchScorer_Finalize(int batch_size, + BeamScorerState& state, + gsl::span sequences, + int sequence_length, + gsl::span beam_hyps, + gsl::span final_beam_scores, + gsl::span output, + gsl::span sequence_scores, + cudaStream_t stream); +*/ + __global__ void AddProbsKernel(float* log_probs, float* cum_log_probs, const int vocab_size, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 5ed5949196b29..15624c77c197a 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -110,6 +110,7 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp gsl::span next_beam_indices, cudaStream_t stream); +template void LaunchBeamSearchScorer_Finalize(int batch_size, BeamScorerState& state, gsl::span sequences, @@ -117,7 +118,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps_, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream); void LaunchNextTokenKernel(const int64_t* next_token_indices, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 380d561bbb23c..92ece98e909d9 100644 --- 
a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -20,6 +20,10 @@ #include "contrib_ops/cuda/transformers/beam_search_topk.h" #include "contrib_ops/cuda/transformers/greedy_search_top_one.h" #include "core/providers/cuda/tensor/transpose.h" +#include +#include +#include +#include // the includes would be dummy for ROCm, we will ignore them for now #ifdef ENABLE_NVTX_PROFILE @@ -334,6 +338,7 @@ Status ProcessLogits(const OrtValue& logits, // Stream* ort_stream, // cuda stream (for CUDA only) const transformers::IConsoleDumper* dumper) { // tensor dumper +std::cout << "Processig logits!" << std::endl; #ifdef ENABLE_NVTX_PROFILE profile::NvtxNestedRangeCreator processLogitsRange("ProcessLogits", profile::Color::Red); processLogitsRange.Begin(); @@ -401,7 +406,7 @@ Status ProcessLogits(const OrtValue& logits, // dumper->Print("next_token_logits", next_token_logits.data(), batch_size, num_beams, vocab_size); } #endif - + std::cout << "Log softmax" << std::endl; // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; @@ -431,6 +436,7 @@ Status ProcessLogits(const OrtValue& logits, // int extra_decoding_len = static_cast(parameters->extra_decoding_ids.size() / parameters->batch_size); const bool need_handle_extra_decoding_ids = is_whisper_model && (!parameters->extra_decoding_ids.empty()) && (extra_decoding_len >= step); + std::cout << "Launcing processing kernel" << std::endl; cuda::LaunchLogitsProcessKernel( next_token_scores.data(), parameters->vocab_mask.data(), @@ -482,6 +488,7 @@ Status ProcessLogits(const OrtValue& logits, // CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); } + std::cout << "LaunchForceDecoding" << std::endl; if (need_handle_extra_decoding_ids && !parameters->extra_decoding_ids.empty()) { cuda::LaunchForceDecodingIds( next_token_scores.data(), @@ -497,6 +504,7 @@ Status ProcessLogits(const OrtValue& logits, // #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after logits process", next_token_scores.data(), batch_size, num_beams, vocab_size); #endif +std::cout << "Addprobs to" << std::endl; // Add beam score to next token scores. Corresponding python code is like: // next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) cuda::LaunchAddProbsKernel(next_token_scores.data(), beam_state->beam_scores.data(), @@ -743,21 +751,44 @@ bool CudaBeamSearchScorer::IsDoneLater() const { return state_cpu_->not_done_count_ == 0; } +template +gsl::spanconvert_span(gsl::span s) +{ + auto data = s.data(); + if (!data) + { + return {}; + } + auto bytes = s.size_bytes(); + Expects(bytes % sizeof(T) == 0); + + return { reinterpret_cast(data), bytes / gsl::narrow_cast(sizeof(T)) }; +} + void CudaBeamSearchScorer::Finalize(transformers::ISequences& sequences, gsl::span& final_beam_scores, Tensor* output_sequences, Tensor* output_sequence_scores) { + std::cout << "HERE" << std::endl; ORT_ENFORCE(output_sequences != nullptr); // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length). + std::cout << "HERE121212" << std::endl; gsl::span output{output_sequences->MutableData(), static_cast(output_sequences->Shape().Size())}; // Score of each sequence, with shape (batch_size * num_return_sequences). 
gsl::span sequence_scores; + //static_cast(*temperature_tensor->Data() if (output_sequence_scores) { - sequence_scores = gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; + gsl::span old_span = gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; + auto data = old_span.data(); + auto bytes = old_span.size_bytes(); + sequence_scores = gsl::span{reinterpret_cast(data), bytes / gsl::narrow_cast(sizeof(float))}; + //sequence_scores = convert_span(old_span); //gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; } + std::cout << "HERE3" << std::endl; + //gsl::span new_scores = static_cast() cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetCurrentDeviceSequences(), sequences.GetSequenceLength(), beam_hyps_, final_beam_scores, output, sequence_scores, stream_); } From 5694e80d8b0dabcae9221f622243860c844162bd Mon Sep 17 00:00:00 2001 From: Peter McAughan Date: Wed, 17 Jan 2024 20:15:39 +0000 Subject: [PATCH 4/9] Revert "Debugging dump" This reverts commit 676b6f078d0ba692a4ee141782b65ef18d134a26. --- .../cpu/transformers/beam_search_impl_base.h | 5 --- .../transformers/beam_search_impl_whisper.h | 15 +------- .../transformers/beam_search_parameters.cc | 2 -- .../cpu/transformers/beam_search_scorer.cc | 1 - .../transformers/generation_device_helper.cc | 7 ++-- .../cpu/transformers/logits_processor.cc | 4 +-- .../cuda/transformers/generation_cuda_impl.cu | 29 ++------------- .../cuda/transformers/generation_cuda_impl.h | 3 +- .../transformers/generation_device_helper.cc | 35 ++----------------- 9 files changed, 10 insertions(+), 91 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h index 9ca14a334cc0c..29b38fc234de5 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h @@ -247,7 +247,6 @@ Status BeamSearchBase::ProcessLogits( BeamSearchCpuState& cpu_state, AllocatorPtr& allocator, int counter) { - std::cout << "Processing logits?" << std::endl; return process_logits_func_(logits, &beam_state, &(cpu_state.sequences), allocator, thread_pool_, &logits_processors_, beam_scorer_.get(), parameters_, counter, ort_stream_, GetConsoleDumper()); @@ -261,9 +260,7 @@ Status BeamSearchBase::GenerateNextToken( BeamSearchCpuState& cpu_state, int counter) { // Process logits to get next token scores - std::cout << "Processing Logits!" << std::endl; ORT_RETURN_IF_ERROR(ProcessLogits(logits, beam_state, cpu_state, temp_space_allocator_, counter)); - std::cout << "Getting next scores!" 
<< std::endl; if (this->IsCuda()) { auto beam_scores = beam_scorer_->GetNextScores(); @@ -277,8 +274,6 @@ Status BeamSearchBase::GenerateNextToken( beam_next_tokens = beam_scorer_->GetNextTokens(); - std::cout << "Got next scores" << std::endl; - #ifdef DEBUG_GENERATION auto beam_indices = beam_scorer_->GetNextIndicesGPU(); cuda_dumper_->Print("beam_scores from scorer", beam_state.beam_scores.data(), parameters_->batch_size, parameters_->num_beams); diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index d2baefc9800c2..c9fbc5861583f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -114,7 +114,6 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe const BeamSearchParameters* parameters = this->parameters_; - std::cout << "Executing Encoder" << std::endl; // Allocate output tensors. int64_t sequences_dims[] = {parameters->batch_size, parameters->num_return_sequences, parameters->max_length}; TensorShape sequences_shape(&sequences_dims[0], sizeof(sequences_dims) / sizeof(sequences_dims[0])); @@ -202,13 +201,10 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe } #endif -std::cout << "Done with encoder!" << std::endl; - // ------------------------------------ // Initialize resources // ------------------------------------ -std::cout << "Initializing beam_State" << std::endl; BeamSearchState beam_state{*parameters, this->temp_space_allocator_, decoder_subgraph_.has_decoder_masked_attention_, @@ -241,7 +237,6 @@ std::cout << "Initializing beam_State" << std::endl; // ------------------------------------------------------------------------------ // Generate next token from logits output from encoder, and initialize decoder inputs. // ------------------------------------------------------------------------------ - std::cout << "Initialized beam_State!" << std::endl; gsl::span beam_next_tokens; int iteration_counter = 0; @@ -260,8 +255,6 @@ std::cout << "Initializing beam_State" << std::endl; std::vector decoder_fetches; - std::cout << "Executing search!" << std::endl; - if (current_length + 1 < parameters->max_length) { ++iteration_counter; ORT_RETURN_IF_ERROR(this->GenerateNextToken(encoder_fetches[0], @@ -270,7 +263,7 @@ std::cout << "Initializing beam_State" << std::endl; cpu_state, iteration_counter)); ++current_length; // Increase sequence length after a new token is generated. 
- std::cout << "1" << std::endl; + ORT_RETURN_IF_ERROR(decoder_subgraph_.CreateInitialFeeds(this->cpu_allocator_, ReinterpretAsSpan(beam_next_tokens), this->implicit_inputs_, @@ -287,7 +280,6 @@ std::cout << "Initializing beam_State" << std::endl; cpu_state.sequences, parameters->max_length, decoder_subgraph_.has_decoder_masked_attention_)); - std::cout << "2" << std::endl; if (decoder_subgraph_.past_present_share_buffer_) { decoder_fetches.reserve(static_cast(decoder_subgraph_.GetFirstPresentOutputIndex()) + @@ -304,10 +296,7 @@ std::cout << "Initializing beam_State" << std::endl; } } - std::cout << "3" << std::endl; - if (decoder_subgraph_.output_cross_qk_) { - std::cout << "QK" << std::endl; ORT_ENFORCE(decoder_subgraph_.has_decoder_masked_attention_, "decoder subgraph: output_cross_qk could only work with has_decoder_masked_attention"); ORT_ENFORCE(decoder_subgraph_.past_present_share_buffer_, "decoder subgraph: output_cross_qk could only work with past_present_share_buffer"); @@ -510,8 +499,6 @@ std::cout << "Initializing beam_State" << std::endl; std::cout << "final_beam_scores" << std::endl; std::cout << "float?" << output_sequences_scores->IsDataType()<< std::endl; std::cout << "float16?" << output_sequences_scores->IsDataType() << std::endl; - std::cout << "Output Scores?" << output_scores << std::endl; - this->beam_scorer_->Finalize(cpu_state.sequences, final_beam_scores, output_sequences, diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 65946f9e251dc..4de031a9f9e83 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -124,7 +124,6 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { ORT_ENFORCE(logits_processor >= 0, "logits_processor shall be a non-negative integer, got ", logits_processor); - std::cout << "Reading in temperature now!" << std::endl; auto* temperature_tensor = context->Input(14); if (temperature_tensor) { if (temperature_tensor->IsDataType()) { @@ -135,7 +134,6 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { } else { temperature = 1.0f; } - //std::cout << "Temperature Setasdasd!!" << temperature << std::endl; //auto* temperature_tensor = context->Input(14); //temperature = temperature_tensor ? *temperature_tensor->Data() : 1.0f; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index 9908842665d5d..e01391c7fa4fd 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc @@ -215,7 +215,6 @@ void BeamSearchScorer::Finalize(ISequences& sequences, std::cout << "here3" << std::endl; // Score of each sequence, with shape (batch_size * num_return_sequences). //gsl::span sequence_scores; - // cast each element to FP16? 
gsl::span sequence_scores; if (output_sequence_scores) { sequence_scores = output_sequence_scores->MutableDataAsSpan(); diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc index 2a65187f81ea9..927d3a58e5a6f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc @@ -292,7 +292,7 @@ Status ProcessLogits(const OrtValue& logits, // #ifndef DEBUG_GENERATION ORT_UNUSED_PARAMETER(dumper); #endif - std::cout << "PRocessing Logits!" << std::endl; + int batch_size = parameters->batch_size; int num_beams = parameters->num_beams; int vocab_size = parameters->vocab_size; @@ -334,7 +334,7 @@ Status ProcessLogits(const OrtValue& logits, // dumper->Print("next_token_logits", next_token_logits.data(), batch_size, num_beams, vocab_size); } #endif -std::cout << "Next token score!" << std::endl; + // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; ORT_RETURN_IF_ERROR( @@ -349,10 +349,9 @@ std::cout << "Next token score!" << std::endl; #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); #endif -std::cout << "Processors starting" << std::endl; + // Apply all score processors that updates scores logits_processors->Process(sequences, next_token_scores, step); -std::cout << "Processor ending" << std::endl; #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after logits process", next_token_scores.data(), batch_size, num_beams, vocab_size); diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc index 0db1abf6da673..f39f090c78b0c 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.cc @@ -184,8 +184,6 @@ TemperatureLogitsProcessor::TemperatureLogitsProcessor(float temperature) : t template void TemperatureLogitsProcessor::Process(const ISequences* /*sequences*/, NextTokenScores& next_token_scores) { - - std::cout << "Applying Temperature!" << std::endl; if (temperature_ == 1.0f) { return; } @@ -195,7 +193,7 @@ void TemperatureLogitsProcessor::Process(const ISequences* /*sequences*/, *p /= temperature_; ++p; } -std::cout << "Applied temperature!" 
<< std::endl; + #ifdef DEBUG_GENERATION DumpScores("TemperatureLogitsProcessor", next_token_scores); #endif diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 590f4da0a8a79..0f3bc25d0ea8e 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -8,7 +8,6 @@ #include #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" -#include namespace onnxruntime { namespace contrib { namespace cuda { @@ -501,14 +500,13 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp next_beam_tokens.data()); } -template __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, const int32_t* sequences_buffer, int sequence_length, BeamHypotheses* beam_hyps_, const float* final_beam_scores, int32_t* output, - T* sequence_scores) { + float* sequence_scores) { int batch_index = blockIdx.x * blockDim.x + threadIdx.x; if (batch_index >= state.batch_size_) return; @@ -535,7 +533,6 @@ __global__ void BeamSearchScorer_Finalize(BeamScorerState& state, sequence_scores ? sequence_scores + batch_index * state.num_return_sequences_ : nullptr); } -template void LaunchBeamSearchScorer_Finalize(int batch_size, BeamScorerState& state, gsl::span sequences, @@ -543,7 +540,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream) { BeamSearchScorer_Finalize<<<1, batch_size, 0, stream>>>(state, sequences.data(), @@ -554,28 +551,6 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, sequence_scores.data()); } -template void LaunchBeamSearchScorer_Finalize(int batch_size, - BeamScorerState& state, - gsl::span sequences, - int sequence_length, - gsl::span beam_hyps, - gsl::span final_beam_scores, - gsl::span output, - gsl::span sequence_scores, - cudaStream_t stream); - -/* -template void LaunchBeamSearchScorer_Finalize(int batch_size, - BeamScorerState& state, - gsl::span sequences, - int sequence_length, - gsl::span beam_hyps, - gsl::span final_beam_scores, - gsl::span output, - gsl::span sequence_scores, - cudaStream_t stream); -*/ - __global__ void AddProbsKernel(float* log_probs, float* cum_log_probs, const int vocab_size, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 15624c77c197a..5ed5949196b29 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -110,7 +110,6 @@ void LaunchBeamSearchScorer_AppendNextTokenToSequences(BeamScorerState& state_cp gsl::span next_beam_indices, cudaStream_t stream); -template void LaunchBeamSearchScorer_Finalize(int batch_size, BeamScorerState& state, gsl::span sequences, @@ -118,7 +117,7 @@ void LaunchBeamSearchScorer_Finalize(int batch_size, gsl::span beam_hyps_, gsl::span final_beam_scores, gsl::span output, - gsl::span sequence_scores, + gsl::span sequence_scores, cudaStream_t stream); void LaunchNextTokenKernel(const int64_t* next_token_indices, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 92ece98e909d9..380d561bbb23c 100644 --- 
a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -20,10 +20,6 @@ #include "contrib_ops/cuda/transformers/beam_search_topk.h" #include "contrib_ops/cuda/transformers/greedy_search_top_one.h" #include "core/providers/cuda/tensor/transpose.h" -#include -#include -#include -#include // the includes would be dummy for ROCm, we will ignore them for now #ifdef ENABLE_NVTX_PROFILE @@ -338,7 +334,6 @@ Status ProcessLogits(const OrtValue& logits, // Stream* ort_stream, // cuda stream (for CUDA only) const transformers::IConsoleDumper* dumper) { // tensor dumper -std::cout << "Processig logits!" << std::endl; #ifdef ENABLE_NVTX_PROFILE profile::NvtxNestedRangeCreator processLogitsRange("ProcessLogits", profile::Color::Red); processLogitsRange.Begin(); @@ -406,7 +401,7 @@ std::cout << "Processig logits!" << std::endl; dumper->Print("next_token_logits", next_token_logits.data(), batch_size, num_beams, vocab_size); } #endif - std::cout << "Log softmax" << std::endl; + // Get scores for candidates of next token: next_token_scores = log_softmax(next_token_logits, dim=-1) gsl::span& next_token_scores = beam_state->next_token_scores; @@ -436,7 +431,6 @@ std::cout << "Processig logits!" << std::endl; int extra_decoding_len = static_cast(parameters->extra_decoding_ids.size() / parameters->batch_size); const bool need_handle_extra_decoding_ids = is_whisper_model && (!parameters->extra_decoding_ids.empty()) && (extra_decoding_len >= step); - std::cout << "Launcing processing kernel" << std::endl; cuda::LaunchLogitsProcessKernel( next_token_scores.data(), parameters->vocab_mask.data(), @@ -488,7 +482,6 @@ std::cout << "Processig logits!" << std::endl; CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(cuda_stream)); } - std::cout << "LaunchForceDecoding" << std::endl; if (need_handle_extra_decoding_ids && !parameters->extra_decoding_ids.empty()) { cuda::LaunchForceDecodingIds( next_token_scores.data(), @@ -504,7 +497,6 @@ std::cout << "Processig logits!" << std::endl; #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after logits process", next_token_scores.data(), batch_size, num_beams, vocab_size); #endif -std::cout << "Addprobs to" << std::endl; // Add beam score to next token scores. Corresponding python code is like: // next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) cuda::LaunchAddProbsKernel(next_token_scores.data(), beam_state->beam_scores.data(), @@ -751,44 +743,21 @@ bool CudaBeamSearchScorer::IsDoneLater() const { return state_cpu_->not_done_count_ == 0; } -template -gsl::spanconvert_span(gsl::span s) -{ - auto data = s.data(); - if (!data) - { - return {}; - } - auto bytes = s.size_bytes(); - Expects(bytes % sizeof(T) == 0); - - return { reinterpret_cast(data), bytes / gsl::narrow_cast(sizeof(T)) }; -} - void CudaBeamSearchScorer::Finalize(transformers::ISequences& sequences, gsl::span& final_beam_scores, Tensor* output_sequences, Tensor* output_sequence_scores) { - std::cout << "HERE" << std::endl; ORT_ENFORCE(output_sequences != nullptr); // Word IDs of each sequence, with shape (batch_size * num_return_sequences, max_sequence_length). - std::cout << "HERE121212" << std::endl; gsl::span output{output_sequences->MutableData(), static_cast(output_sequences->Shape().Size())}; // Score of each sequence, with shape (batch_size * num_return_sequences). 
gsl::span sequence_scores; - //static_cast(*temperature_tensor->Data() if (output_sequence_scores) { - gsl::span old_span = gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; - auto data = old_span.data(); - auto bytes = old_span.size_bytes(); - sequence_scores = gsl::span{reinterpret_cast(data), bytes / gsl::narrow_cast(sizeof(float))}; - //sequence_scores = convert_span(old_span); //gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; + sequence_scores = gsl::span{output_sequence_scores->MutableData(), static_cast(output_sequence_scores->Shape().Size())}; } - std::cout << "HERE3" << std::endl; - //gsl::span new_scores = static_cast() cuda::LaunchBeamSearchScorer_Finalize(state_cpu_->batch_size_, *state_gpu_, sequences.GetCurrentDeviceSequences(), sequences.GetSequenceLength(), beam_hyps_, final_beam_scores, output, sequence_scores, stream_); } From 1380a15bc3ab4ac144e1b17464bbc13c54aaa12b Mon Sep 17 00:00:00 2001 From: Peter McAughan Date: Wed, 17 Jan 2024 20:15:55 +0000 Subject: [PATCH 5/9] Revert "Debugging" This reverts commit d989904cc9009c13de888cd28b1ed58d6389e747. --- include/onnxruntime/core/framework/tensor.h | 8 ++++---- .../cpu/transformers/beam_search_impl_whisper.h | 8 -------- .../cpu/transformers/beam_search_parameters.cc | 3 +-- .../contrib_ops/cpu/transformers/beam_search_scorer.cc | 10 +--------- .../contrib_ops/cpu/transformers/logits_processor.h | 2 +- .../cuda/transformers/generation_cuda_impl.cu | 1 + onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 7 +++---- onnxruntime/core/session/inference_session.cc | 1 - 8 files changed, 11 insertions(+), 29 deletions(-) diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index c4e773ebe8f6b..a867ab6066485 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -186,8 +186,8 @@ class Tensor final { template T* MutableData() { // Type check - //ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", - // "T ", "!=", dtype_); + ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", + "T ", "!=", dtype_); return reinterpret_cast(static_cast(p_data_) + byte_offset_); } @@ -197,8 +197,8 @@ class Tensor final { template gsl::span MutableDataAsSpan() { // Type check - //ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", - // "T ", "!=", dtype_); + ORT_ENFORCE(utils::IsPrimitiveDataType(dtype_), "Tensor type mismatch. ", + "T ", "!=", dtype_); T* data = reinterpret_cast(static_cast(p_data_) + byte_offset_); return gsl::make_span(data, static_cast(shape_.Size())); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h index c9fbc5861583f..91b93a125ad7a 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_whisper.h @@ -464,7 +464,6 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe } } - std::cout << "Done with search!" << std::endl; if (decoder_subgraph_.output_cross_qk_) { TensorShape cross_qk_shape{ static_cast(parameters->batch_size), @@ -494,18 +493,12 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe beam_indices)); } - std::cout << "Done with QK!..." 
<< std::endl; gsl::span final_beam_scores = beam_state.beam_scores; - std::cout << "final_beam_scores" << std::endl; - std::cout << "float?" << output_sequences_scores->IsDataType()<< std::endl; - std::cout << "float16?" << output_sequences_scores->IsDataType() << std::endl; this->beam_scorer_->Finalize(cpu_state.sequences, final_beam_scores, output_sequences, output_sequences_scores); - std::cout << "Outputting scores..." << std::endl; - // Output per token scores if (output_scores) { gsl::span target = output_scores->MutableDataAsSpan(); @@ -514,7 +507,6 @@ Status BeamSearchWhisper::Execute(const FeedsFetchesManager& encoder_feeds_fe ORT_RETURN_IF_ERROR(this->device_copy_func_(target, source, nullptr, DeviceCopyDirection::deviceToDevice)); } - std::cout << "Outputting scores!" << std::endl; return status; } diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc index 4de031a9f9e83..6e6a3e616e6cc 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc @@ -125,6 +125,7 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { "logits_processor shall be a non-negative integer, got ", logits_processor); auto* temperature_tensor = context->Input(14); + auto* temperature_tensor = context->Input(6); if (temperature_tensor) { if (temperature_tensor->IsDataType()) { temperature = *temperature_tensor->Data(); @@ -134,8 +135,6 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) { } else { temperature = 1.0f; } - //auto* temperature_tensor = context->Input(14); - //temperature = temperature_tensor ? *temperature_tensor->Data() : 1.0f; } void BeamSearchParameters::SetSubgraphParameters(int vocabulary_size, int heads, int hidden_size_per_head, int layers) { // Override vocab_size using the inferred shape from the decoder subgraph ONLY IF diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc index e01391c7fa4fd..7e2e5b2129221 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_scorer.cc @@ -13,8 +13,6 @@ #include "core/providers/cpu/tensor/utils.h" #include "core/providers/cpu/rnn/rnn_helpers.h" #include "contrib_ops/cpu/transformers/beam_search_scorer.h" -#include -#include namespace onnxruntime { namespace contrib { @@ -189,15 +187,13 @@ void BeamSearchScorer::Finalize(ISequences& sequences, Tensor* output_sequence_scores) { ORT_ENFORCE(output_sequences != nullptr); - // Finalize all open beam hypotheses and add to generated hypotheses. - std::cout << "here1" << std::endl; for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; if (beam_hyp.done_) { continue; } - std::cout << "here2" << std::endl; + for (size_t beam_index = 0; beam_index < num_beams_; beam_index++) { size_t batch_beam_index = batch_index * num_beams_ + beam_index; float final_score = final_beam_scores[batch_beam_index]; @@ -212,15 +208,12 @@ void BeamSearchScorer::Finalize(ISequences& sequences, // Fill output sequences with pad token ID so that we do not need append it later. std::fill_n(output.data(), output.size(), pad_token_id_); - std::cout << "here3" << std::endl; // Score of each sequence, with shape (batch_size * num_return_sequences). 
- //gsl::span sequence_scores; gsl::span sequence_scores; if (output_sequence_scores) { sequence_scores = output_sequence_scores->MutableDataAsSpan(); } - std::cout << "here4" << std::endl; // Select the best hypotheses according to number of sequences to return. for (size_t batch_index = 0; batch_index < batch_size_; batch_index++) { BeamHypotheses& beam_hyp = beam_hyps_[batch_index]; @@ -234,7 +227,6 @@ void BeamSearchScorer::Finalize(ISequences& sequences, beam_hyp.Output(narrow(num_return_sequences_), narrow(max_length_), batch_output, sequence_scores_buffer); } - std::cout << "here5" << std::endl; } } // namespace transformers diff --git a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h index 43a449520dfaa..4688ff272cee9 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h +++ b/onnxruntime/contrib_ops/cpu/transformers/logits_processor.h @@ -10,7 +10,7 @@ #include "contrib_ops/cpu/transformers/greedy_search_parameters.h" #include "contrib_ops/cpu/transformers/sampling_parameters.h" #include "contrib_ops/cpu/transformers/generation_shared.h" -#include + namespace onnxruntime { namespace contrib { namespace transformers { diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 0f3bc25d0ea8e..dbd7fb010462d 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -8,6 +8,7 @@ #include #include "contrib_ops/cuda/bert/utils.cuh" #include "contrib_ops/cuda/transformers/generation_cuda_impl.h" + namespace onnxruntime { namespace contrib { namespace cuda { diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 6cdc759eb1641..49bb657f68cc0 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1231,14 +1231,14 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1, "In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) " "are treated as stop of the extra_decoding_ids for corresponding batch.", "I", OpSchema::Optional) - .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional) + .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional) .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I") - .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "L", OpSchema::Optional) + .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional) .Output(2, "scores", "Processed beam scores for each vocabulary token at each generation step." "Beam scores consisting of log softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam." "Shape is (max_length - sequence_length, batch_size, num_beams, vocab_size)", - "L", OpSchema::Optional) + "T", OpSchema::Optional) .Output(3, "cross_qk", "Output the accumulated stacked Q*K in cross attentions. 
From baf9349cf8e87903417b0a0fc011ecaa1bb5ec40 Mon Sep 17 00:00:00 2001
From: petermcaughan
Date: Wed, 17 Jan 2024 12:47:06 -0800
Subject: [PATCH 7/9] Update onnxruntime/core/graph/contrib_ops/contrib_defs.cc

Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
---
 onnxruntime/core/graph/contrib_ops/contrib_defs.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 49bb657f68cc0..6a4b5d4869113 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -1231,7 +1231,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(WhisperBeamSearch, 1,
                 "In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) "
                 "are treated as stop of the extra_decoding_ids for corresponding batch.",
                 "I", OpSchema::Optional)
-    .Input(14, "temperature", "Temperature value to apply to logit processing during this execution's decoding. Shape is (1)", "V", OpSchema::Optional)
+    .Input(14, "temperature", "Temperature value to apply to logits processing during this execution's decoding. Shape is (1)", "T", OpSchema::Optional)
     .Output(0, "sequences", "Word IDs of generated sequences. Shape is (batch_size, num_return_sequences, max_sequence_length)", "I")
     .Output(1, "sequences_scores", "Final beam score of the generated sequences. Shape is (batch_size, num_return_sequences)", "T", OpSchema::Optional)
     .Output(2, "scores",
From 82ea9fa1db363672354591548849d9c33c33dfd8 Mon Sep 17 00:00:00 2001
From: Kunal Vaishnavi
Date: Fri, 19 Jan 2024 07:22:40 +0000
Subject: [PATCH 8/9] Check temperature input only for Whisper

---
 .../cpu/transformers/beam_search_parameters.cc | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc
index 263025727ab9b..bb6885c3216bc 100644
--- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc
+++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_parameters.cc
@@ -124,15 +124,17 @@ void BeamSearchParameters::ParseFromInputs(OpKernelContext* context) {
   ORT_ENFORCE(logits_processor >= 0,
              "logits_processor shall be a non-negative integer, got ", logits_processor);
 
-  auto* temperature_tensor = context->Input<Tensor>(14);
-  if (temperature_tensor) {
-    if (temperature_tensor->IsDataType<float>()) {
-      temperature = *temperature_tensor->Data<float>();
+  if (this->model_type == IGenerationParameters::kModelTypeWhisper) {
+    auto* temperature_tensor = context->Input<Tensor>(14);
+    if (temperature_tensor) {
+      if (temperature_tensor->IsDataType<float>()) {
+        temperature = *temperature_tensor->Data<float>();
+      } else {
+        temperature = static_cast<float>(*temperature_tensor->Data<MLFloat16>());
+      }
     } else {
-      temperature = static_cast<float>(*temperature_tensor->Data<MLFloat16>());
+      temperature = 1.0f;
     }
-  } else {
-    temperature = 1.0f;
   }
 }
 void BeamSearchParameters::SetSubgraphParameters(int vocabulary_size, int heads, int hidden_size_per_head, int layers) {
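With patch 8, temperature parsing is scoped to Whisper models: the kernel reads optional input 14 as float or float16 and falls back to 1.0f. The value itself only takes effect later, when the logits processors run. The standalone sketch below is an editor-added illustration, not code from these patches; ApplyTemperature and Softmax are hypothetical names. It shows the conventional behavior such a value drives: each logit is divided by the temperature before softmax, so a value above 1.0 flattens the token distribution and a value below 1.0 sharpens it.

// Illustrative sketch only (not from the patches): conventional temperature
// scaling of logits prior to softmax.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical helper; the ORT kernel applies the equivalent scaling inside
// its logits processors rather than through a free function like this.
void ApplyTemperature(std::vector<float>& logits, float temperature) {
  if (temperature <= 0.0f || temperature == 1.0f) {
    return;  // 1.0f (the kernel's default) is a no-op
  }
  for (float& logit : logits) {
    logit /= temperature;
  }
}

std::vector<float> Softmax(const std::vector<float>& logits) {
  const float max_logit = *std::max_element(logits.begin(), logits.end());
  std::vector<float> probs(logits.size());
  float sum = 0.0f;
  for (size_t i = 0; i < logits.size(); ++i) {
    probs[i] = std::exp(logits[i] - max_logit);  // subtract max for numerical stability
    sum += probs[i];
  }
  for (float& p : probs) {
    p /= sum;
  }
  return probs;
}

int main() {
  std::vector<float> logits = {2.0f, 1.0f, 0.1f};
  ApplyTemperature(logits, 2.0f);  // same flattening effect a temperature input of 2.0 would have
  for (float p : Softmax(logits)) {
    std::printf("%f\n", p);
  }
  return 0;
}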
From 9977e1f2f66af42fcb61df89e00cf8836d0932bd Mon Sep 17 00:00:00 2001
From: Kunal Vaishnavi
Date: Fri, 19 Jan 2024 22:45:51 +0000
Subject: [PATCH 9/9] Update docs

---
 docs/ContribOperators.md | 4 +++-
 docs/OperatorKernels.md  | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 45c0e6f822ce9..eaa2160c83735 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -5755,7 +5755,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Size of the vocabulary. If not provided, it will be inferred from the decoder subgraph's output shape</dd>
 </dl>
 
-#### Inputs (5 - 14)
+#### Inputs (5 - 15)
 
 <dl>
 <dt><tt>input_ids</tt> : F</dt>
@@ -5786,6 +5786,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Only keep this list of (layer, head) of QK in the final cross_qk output when use_cross_qk is set. Default collect allits shape is (number of (layer, head) to keep, 2), i.e., [[layer_id1, head_id1], [layer_id2, head_id2]......]</dd>
 <dt><tt>extra_decoding_ids</tt> (optional) : I</dt>
 <dd>Part of the decoder_input_ids that we need cross qk for it. it is of shape (batch_size, extra_decoding_ids_len).In such case, we should remove this from the tail of the decoder_input_ids, and put it here. ids < 0 in it (for multiple batch) are treated as stop of the extra_decoding_ids for corresponding batch.</dd>
+<dt><tt>temperature</tt> (optional) : T</dt>
+<dd>Temperature value to apply to logits processing during this execution's decoding. Shape is (1)</dd>
 </dl>
 
 #### Outputs (1 - 5)
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 394bd7ad2abae..d6322de18e7ef 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -499,7 +499,7 @@ Do not modify directly.*
 |TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
 |Trilu|*in* X:**T**<br> *in* k:**tensor(int64)**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int64)|
 |Unique|*in* x:**T**<br> *out* y:**T**<br> *out* idx:**tensor(int64)**<br> *out* counts:**tensor(int64)**|1+|**T** = tensor(float)|
-|WhisperBeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *in* cross_qk_layer_head:**I**<br> *in* extra_decoding_ids:**I**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**<br> *out* cross_qk:**V**<br> *out* non_speech_probs:**T**|1+|**T** = tensor(float)|
+|WhisperBeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *in* cross_qk_layer_head:**I**<br> *in* extra_decoding_ids:**I**<br> *in* temperature:**T**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**<br> *out* cross_qk:**V**<br> *out* non_speech_probs:**T**|1+|**T** = tensor(float)|
 |WordConvEmbedding|*in* Sequence:**T**<br> *in* W:**T1**<br> *in* B:**T1**<br> *in* C:**T1**<br> *out* Y:**T1**|1+|**T** = tensor(int32)<br> **T1** = tensor(float)|
 | |
 | |
@@ -876,7 +876,7 @@ Do not modify directly.*
 |TransposeMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
 |Trilu|*in* X:**T**<br> *in* k:**tensor(int64)**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |UnfoldTensor|*in* input:**T**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
-|WhisperBeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *in* cross_qk_layer_head:**I**<br> *in* extra_decoding_ids:**I**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**<br> *out* cross_qk:**V**<br> *out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
+|WhisperBeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *in* cross_qk_layer_head:**I**<br> *in* extra_decoding_ids:**I**<br> *in* temperature:**T**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**<br> *out* cross_qk:**V**<br> *out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
 | |
 | |
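For completeness, here is how a client might exercise the input documented above. The sketch is an editor-added illustration rather than part of the patch series: it only builds the optional temperature tensor (input name "temperature", shape (1), float) with the ONNX Runtime C++ API, and the session creation plus the other WhisperBeamSearch inputs that a real run needs are elided.

// Illustrative sketch only (not from the patches). Builds the optional
// 'temperature' input for WhisperBeamSearch; everything else about the
// session and the remaining inputs is omitted.
#include <array>
#include <cstdio>

#include <onnxruntime_cxx_api.h>

int main() {
  // The CUDA kernel pins input 14 to CPU memory (OrtMemTypeCPUInput in the
  // registration earlier in this series), so a plain CPU tensor is correct
  // even when the session runs on CUDAExecutionProvider.
  Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);

  float temperature = 0.8f;         // the kernel defaults to 1.0f when this input is absent
  std::array<int64_t, 1> shape{1};  // schema: "Shape is (1)"
  Ort::Value temperature_tensor = Ort::Value::CreateTensor<float>(
      memory_info, &temperature, /*p_data_element_count=*/1, shape.data(), shape.size());

  // A real call would pass this value under the input name "temperature",
  // alongside input_ids, max_length, min_length, num_beams, and
  // num_return_sequences, via Ort::Session::Run(...).
  std::printf("temperature tensor ready: %s\n", temperature_tensor.IsTensor() ? "yes" : "no");
  return 0;
}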