From 43cd5e335916e67f627d4e47306a425563dae5d4 Mon Sep 17 00:00:00 2001
From: Ben Epstein
Date: Thu, 2 May 2024 18:16:35 -0400
Subject: [PATCH 1/8] Update phi-3-tutorial.md (#378)

relates to https://github.com/microsoft/onnxruntime-genai/issues/376

---------

Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
---
 examples/python/phi-3-tutorial.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index aed02ea42..e1b4456ab 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -9,6 +9,12 @@
 
 Download either or both of the [short](https://aka.ms/phi3-mini-4k-instruct-onnx) and [long](https://aka.ms/phi3-mini-128k-instruct-onnx) context Phi-3 mini models from Hugging Face.
 
+To download the Phi-3 mini models, you will need to have git-lfs installed.
+* MacOS: `brew install git-lfs`
+* Linux: `apt-get install git-lfs`
+* Windows: `winget install -e --id GitHub.GitLFS` (If you don't have winget, download and run the `exe` from the [official source](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage?platform=windows))
+
+Then run `git lfs install`
 
 For the short context model.

From edfee65e47cde1f332a0b5be112274d0c95a6432 Mon Sep 17 00:00:00 2001
From: "Nat Kershaw (MSFT)"
Date: Thu, 2 May 2024 16:49:45 -0700
Subject: [PATCH 2/8] Model generate template (#389)

---
 examples/python/model-generate.py | 52 +++++++++++++++++++++----------
 examples/python/model-qa.py       | 11 +++++--
 examples/python/phi3-qa.py        |  6 ++++
 3 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/examples/python/model-generate.py b/examples/python/model-generate.py
index 9b5af011a..3e2b38d90 100644
--- a/examples/python/model-generate.py
+++ b/examples/python/model-generate.py
@@ -13,15 +13,30 @@ def main(args):
         prompts = args.prompts
     else:
         prompts = ["I like walking my cute dog",
-           "What is the best restaurant in town?",
-           "Hello, how are you today?"]
+                   "What is the best restaurant in town?",
+                   "Hello, how are you today?"]
+
+    if args.chat_template:
+        if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
+            print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
+            exit(1)
+        prompts[:] = [f'{args.chat_template.format(input=text)}' for text in prompts]
+
     input_tokens = tokenizer.encode_batch(prompts)
-    if args.verbose: print("Prompt(s) encoded")
+    if args.verbose: print(f'Prompt(s) encoded: {prompts}')
 
     params = og.GeneratorParams(model)
-    params.set_search_options(max_length=args.max_length, top_p=args.top_p, top_k=args.top_k, temperature=args.temperature, repetition_penalty=args.repetition_penalty)
-    if args.cuda_graph_with_max_batch_size > 0:
-        params.try_use_cuda_graph_with_max_batch_size(args.cuda_graph_with_max_batch_size)
+
+    search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
+
+    if (args.verbose): print(f'Args: {args}')
+    if (args.verbose): print(f'Search options: {search_options}')
+
+    params.set_search_options(**search_options)
+    # Set the batch size for the CUDA graph to the number of prompts if the user didn't specify a batch size
+    params.try_use_cuda_graph_with_max_batch_size(len(prompts))
+    if args.batch_size_for_cuda_graph:
+        params.try_use_cuda_graph_with_max_batch_size(args.batch_size_for_cuda_graph)
     params.input_ids = input_tokens
     if args.verbose: print("GeneratorParams created")
@@ -37,19 +52,24 @@ def main(args):
     print()
     print()
-    print(f"Tokens: {len(output_tokens[0])} Time: {run_time:.2f} Tokens per second: {len(output_tokens[0])/run_time:.2f}")
+    total_tokens = sum(len(x) for x in output_tokens)
+    print(f"Tokens: {total_tokens} Time: {run_time:.2f} Tokens per second: {total_tokens/run_time:.2f}")
     print()
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="End-to-end token generation loop example for gen-ai")
+    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description="End-to-end token generation loop example for gen-ai")
     parser.add_argument('-m', '--model', type=str, required=True, help='Onnx model folder path (must contain config.json and model.onnx)')
-    parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from')
-    parser.add_argument('-l', '--max_length', type=int, default=512, help='Max number of tokens to generate after prompt')
-    parser.add_argument('-p', '--top_p', type=float, default=0.9, help='Top p probability to sample with')
-    parser.add_argument('-k', '--top_k', type=int, default=50, help='Top k tokens to sample from')
-    parser.add_argument('-t', '--temperature', type=float, default=1.0, help='Temperature to sample with')
-    parser.add_argument('-r', '--repetition_penalty', type=float, default=1.0, help='Repetition penalty to sample with')
-    parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
-    parser.add_argument('-c', '--cuda_graph_with_max_batch_size', type=int, default=0, help='Max batch size for CUDA graph')
+    parser.add_argument('-pr', '--prompts', nargs='*', required=False, help='Input prompts to generate tokens from. Provide this parameter multiple times to batch multiple prompts')
+    parser.add_argument('-i', '--min_length', type=int, help='Min number of tokens to generate including the prompt')
+    parser.add_argument('-l', '--max_length', type=int, help='Max number of tokens to generate including the prompt')
+    parser.add_argument('-ds', '--do_random_sampling', action='store_true', help='Do random sampling. When false, greedy or beam search are used to generate the output. Defaults to false')
+    parser.add_argument('-p', '--top_p', type=float, help='Top p probability to sample with')
+    parser.add_argument('-k', '--top_k', type=int, help='Top k tokens to sample from')
+    parser.add_argument('-t', '--temperature', type=float, help='Temperature to sample with')
+    parser.add_argument('-r', '--repetition_penalty', type=float, help='Repetition penalty to sample with')
+    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Print verbose output and timing information. Defaults to false')
+    parser.add_argument('-b', '--batch_size_for_cuda_graph', type=int, default=1, help='Max batch size for CUDA graph')
+    parser.add_argument('-c', '--chat_template', type=str, default='', help='Chat template to use for the prompt. User input will be injected into {input}. If not set, the prompt is used as is.')
+
     args = parser.parse_args()
     main(args)
\ No newline at end of file

diff --git a/examples/python/model-qa.py b/examples/python/model-qa.py
index 57ec9f6db..c1e628eb5 100644
--- a/examples/python/model-qa.py
+++ b/examples/python/model-qa.py
@@ -14,10 +14,15 @@ def main(args):
     tokenizer_stream = tokenizer.create_stream()
     if args.verbose: print("Tokenizer created")
     if args.verbose: print()
+    search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
 
-    if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
-        print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
-        exit(1)
+
+    if args.verbose: print(search_options)
+
+    if args.chat_template:
+        if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
+            print("Error, chat template must have exactly one pair of curly braces, e.g. '<|user|>\n{input} <|end|>\n<|assistant|>'")
+            exit(1)
 
     # Keep asking for input prompts in a loop
     while True:

diff --git a/examples/python/phi3-qa.py b/examples/python/phi3-qa.py
index 4f175c102..977a47589 100644
--- a/examples/python/phi3-qa.py
+++ b/examples/python/phi3-qa.py
@@ -15,6 +15,12 @@ def main(args):
     if args.verbose: print("Tokenizer created")
     if args.verbose: print()
     search_options = {name:getattr(args, name) for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k', 'temperature', 'repetition_penalty'] if name in args}
+
+    # Set the max length to something sensible by default, unless it is specified by the user,
+    # since otherwise it will be set to the entire context length
+    if 'max_length' not in search_options:
+        search_options['max_length'] = 2048
+
     chat_template = '<|user|>\n{input} <|end|>\n<|assistant|>'
 
     # Keep asking for input prompts in a loop
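The heart of PATCH 2/8 is that generation options are only forwarded when the user actually sets them (via `argparse.SUPPRESS`), and that a chat template containing a single `{input}` placeholder is applied to every prompt before tokenization. The standalone sketch below (not part of the patch; the flag values are illustrative) shows both patterns in isolation:

```python
import argparse

# With argument_default=argparse.SUPPRESS, options the user never passed do not
# appear in the namespace at all, so the `name in args` test filters them out.
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
parser.add_argument('-l', '--max_length', type=int)
parser.add_argument('-p', '--top_p', type=float)
parser.add_argument('-c', '--chat_template', type=str, default='')
args = parser.parse_args(['-l', '256', '-c', '<|user|>\n{input} <|end|>\n<|assistant|>'])

search_options = {name: getattr(args, name)
                  for name in ['do_sample', 'max_length', 'min_length', 'top_p', 'top_k',
                               'temperature', 'repetition_penalty'] if name in args}
print(search_options)  # {'max_length': 256} -- top_p was never supplied, so it is absent

# The template must contain exactly one pair of curly braces; the user text is
# substituted for {input} before the prompts are encoded.
prompts = ["I like walking my cute dog"]
if args.chat_template:
    if args.chat_template.count('{') != 1 or args.chat_template.count('}') != 1:
        raise SystemExit("chat template must have exactly one pair of curly braces")
    prompts = [args.chat_template.format(input=text) for text in prompts]
print(prompts[0])  # <|user|> ... <|end|> ... <|assistant|> wrapped around the user text
```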
From b79467e68d3638fd7c46bd5b228092765da4e387 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Fri, 3 May 2024 00:01:52 -0400
Subject: [PATCH 3/8] Exclude libnvinfer_plugin.so.10 (#391)

---
 src/python/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/python/CMakeLists.txt b/src/python/CMakeLists.txt
index bc85d8bce..6dbf82371 100644
--- a/src/python/CMakeLists.txt
+++ b/src/python/CMakeLists.txt
@@ -64,8 +64,12 @@ if(BUILD_WHEEL)
       "libcufft.so.11"
       "libcurand.so.10"
       "libnvinfer.so.8"
+      "libnvinfer.so.10"
       "libnvinfer_plugin.so.8"
+      "libnvinfer_plugin.so.10"
       "libnvonnxparser.so.8"
+      "libnvonnxparser.so.10"
+
   )
   set(modified_exclude_list)
   foreach(item IN LISTS auditwheel_exclude_list)

From 42bd7152eccf1c24c10b6d2531183f3d6777fa2f Mon Sep 17 00:00:00 2001
From: Parinita Rahi <101819959+parinitarahi@users.noreply.github.com>
Date: Thu, 2 May 2024 21:03:23 -0700
Subject: [PATCH 4/8] Update phi-3-tutorial.md (#386)

Replacing the variable with generic text, so users can replace with model path for appropriate models.

---
 examples/python/phi-3-tutorial.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/python/phi-3-tutorial.md b/examples/python/phi-3-tutorial.md
index e1b4456ab..7c5c689cb 100644
--- a/examples/python/phi-3-tutorial.md
+++ b/examples/python/phi-3-tutorial.md
@@ -74,15 +74,14 @@ Run the model with [phi3-qa.py](https://github.com/microsoft/onnxruntime-genai/b
 
 The script accepts a model folder and takes the generation parameters from the config in that model folder. You can also override the parameters on the command line.
 
-This example is using the long context model running with DirectML on Windows.
+
 The `-m` argument is the path to the model you downloaded from HuggingFace above. The `-l` argument is the length of output you would like to generate with the model.
 
 ```bash
 curl https://raw.githubusercontent.com/microsoft/onnxruntime-genai/main/examples/python/phi3-qa.py -o phi3-qa.py
-model_path="./Phi-3-mini-128k-instruct-onnx/directml/directml-int4-awq-block-128"
-python phi3-qa.py -m $model_path -l 2048
+python phi3-qa.py -m *replace your relative model_path here* -l 2048
 ```
 
 Once the script has loaded the model, it will ask you for input in a loop, streaming the output as it is produced the model. For example:

From e82ab3d2e6be48c799b0eab097399c92403973a6 Mon Sep 17 00:00:00 2001
From: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com>
Date: Thu, 2 May 2024 23:02:14 -0700
Subject: [PATCH 5/8] Merge tokenizer library invalid UTF-8 fix (#390)

Port over a fix from the onnxruntime-extensions tokenizer library to fix an invalid UTF-8 issue.
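When token ids are detokenized one at a time during streaming, a multi-byte UTF-8 character can be split across two ids, so decoding each piece on its own produces invalid text. The C++ change below buffers the incomplete bytes in the decoder state and only emits validated sequences. This Python sketch (illustrative only, not the library's implementation) shows the same buffering idea:

```python
def stream_decode(chunks):
    # Hold back bytes until they form complete UTF-8; emit the longest valid prefix.
    # A real implementation also has to discard bytes that can never become valid.
    pending = b""
    for chunk in chunks:
        pending += chunk
        for cut in range(len(pending), -1, -1):
            try:
                text = pending[:cut].decode("utf-8")
                break
            except UnicodeDecodeError:
                continue
        pending = pending[cut:]
        if text:
            yield text

# '€' is 0xE2 0x82 0xAC; split across two chunks it only decodes once complete.
print(list(stream_decode([b"a\xe2\x82", b"\xacb"])))   # ['a', '€b']
```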
---
 src/tokenizer/token_bpe.cc      | 33 ++++++++++++----
 src/tokenizer/token_bpe.h       |  1 +
 src/tokenizer/tokenizer.cc      |  6 +--
 src/tokenizer/utils/unescape.cc | 69 ++++++++++++++++++++++++---------
 4 files changed, 80 insertions(+), 29 deletions(-)

diff --git a/src/tokenizer/token_bpe.cc b/src/tokenizer/token_bpe.cc
index 93c897eea..80ac9d5bf 100644
--- a/src/tokenizer/token_bpe.cc
+++ b/src/tokenizer/token_bpe.cc
@@ -237,15 +237,17 @@ std::vector BPETokenizer::Encode(std::string_view sv_input, int64_
       text = text.strip()
     */
     std::u32string str = RemoveConsecutiveSpaces(input);
-    if (IsUnicodeSpace(str.front())) {
-      str.erase(str.begin());
-    }
-    if (IsUnicodeSpace(str.back())) {
-      str.pop_back();
+    if (!str.empty()) {
+      if (IsUnicodeSpace(str.front())) {
+        str.erase(str.begin());
+      }
+      if (IsUnicodeSpace(str.back())) {
+        str.pop_back();
+      }
+      // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
+      str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
+      str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
     }
-    // remove newlines as CLIP ignores them (treats them as whitespace which is then cleaned)
-    str.erase(std::remove(str.begin(), str.end(), U'\n'), str.end());
-    str.erase(std::remove(str.begin(), str.end(), U'\r'), str.end());
     input = str;
   }
 
@@ -592,6 +594,21 @@ TfmStatus BPETokenizer::Id2Token(tfmTokenId_t id, std::string& token, DecoderSta
         token.push_back(' ');
       }
     }  // end case of whitespace_token_
+
+    bpe_state->incomplete_utf8_ += token;
+    token.clear();
+    std::string& s_utf8 = bpe_state->incomplete_utf8_;
+    size_t utf8_len = 1;
+    size_t utf8_all_len = 0;
+    for (size_t i = 0; i < s_utf8.size(); i += utf8_len) {
+      utf8_len = UTF8Len(s_utf8[i]);
+      if (utf8_len <= s_utf8.size() - i) {
+        utf8_all_len += utf8_len;
+        auto _t = s_utf8.substr(i, utf8_len);
+        token += ValidateUTF8(_t) ? _t : "";
+      }
+    }
+    s_utf8 = s_utf8.substr(utf8_all_len);
   }
 
   return status;

diff --git a/src/tokenizer/token_bpe.h b/src/tokenizer/token_bpe.h
index ed5f1f23c..2327b3a60 100644
--- a/src/tokenizer/token_bpe.h
+++ b/src/tokenizer/token_bpe.h
@@ -28,6 +28,7 @@ class BPETokenizer : public TokenizerImpl {
     BPEDeocerState() = default;
     ~BPEDeocerState() override = default;
     bool f_special_last;
+    std::string incomplete_utf8_;
   };
 
  public:

diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc
index b2a0622e7..251595856 100644
--- a/src/tokenizer/tokenizer.cc
+++ b/src/tokenizer/tokenizer.cc
@@ -30,10 +30,10 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   if (type.empty()) {
     if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
       type = "BPE";
-    } else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
+    } /* else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
       // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
       type = "SPM";
-    } else {
+    } */ else {
       status = TfmStatus(kTfmErrorInvalidArgument,
                          "Cannot determine the tokenizer type from tokenizer_path argument");
     }
   }
@@ -43,7 +43,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   } /* else if (type == "SPM") {
     token_ptr = std::make_unique();
   } */ else {
-    status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, SPM, RKWV) are supported.");
+    status = TfmStatus(kTfmErrorInvalidArgument, "Unknown tokenizer_type, (BPE, RKWV) are supported.");
   }
 
   if (status.ok()) {

diff --git a/src/tokenizer/utils/unescape.cc b/src/tokenizer/utils/unescape.cc
index f42e962f9..f94a1f192 100644
--- a/src/tokenizer/utils/unescape.cc
+++ b/src/tokenizer/utils/unescape.cc
@@ -41,27 +41,60 @@ std::string EncodeUTF8Char(char32_t utf8_char) {
   return {utf8_buf};
 }
 
-bool ValidateUTF8(const std::string& data) {
-  int cnt = 0;
-  for (size_t i = 0; i < data.size(); i++) {
-    int x = data[i];
-    if (!cnt) {
-      if ((x >> 5) == 0b110) {
-        cnt = 1;
-      } else if ((x >> 4) == 0b1110) {
-        cnt = 2;
-      } else if ((x >> 3) == 0b11110) {
-        cnt = 3;
-      } else if ((x >> 7) != 0) {
+  bool ValidateUTF8(const std::string& data) {
+    const unsigned char* s = reinterpret_cast<const unsigned char*>(data.c_str());
+    const unsigned char* s_end = s + data.size();
+    if (*s_end != '\0')
+      return false;
+
+    while (*s) {
+      if (*s < 0x80)
+        /* 0xxxxxxx */
+        s++;
+      else if ((s[0] & 0xe0) == 0xc0) {
+        /* 110XXXXx 10xxxxxx */
+        if (s + 1 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[0] & 0xfe) == 0xc0) /* overlong? */
+          return false;
+        else
+          s += 2;
+      } else if ((s[0] & 0xf0) == 0xe0) {
+        /* 1110XXXX 10Xxxxxx 10xxxxxx */
+        if (s + 2 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
+            (s[0] == 0xed && (s[1] & 0xe0) == 0xa0) || /* surrogate? */
+            (s[0] == 0xef && s[1] == 0xbf &&
+             (s[2] & 0xfe) == 0xbe)) /* U+FFFE or U+FFFF? */
+          return false;
+        else
+          s += 3;
+      } else if ((s[0] & 0xf8) == 0xf0) {
+        /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
+        if (s + 3 >= s_end) {
+          return false;
+        }
+        if ((s[1] & 0xc0) != 0x80 ||
+            (s[2] & 0xc0) != 0x80 ||
+            (s[3] & 0xc0) != 0x80 ||
+            (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
+            (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */
+          return false;
+        else
+          s += 4;
+      } else
         return false;
-      }
-    } else {
-      if ((x >> 6) != 0b10) return false;
-      cnt--;
     }
+
+    return true;
   }
-  return cnt == 0;
-}
+
 bool IsDigit(char c) { return c >= '0' && c <= '9'; }
 bool IsHexDigit(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); }

From 1f3776d425afbd2e8f83f126f1c02f0d13633ea0 Mon Sep 17 00:00:00 2001
From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com>
Date: Thu, 2 May 2024 23:23:58 -0700
Subject: [PATCH 6/8] Fix pad token id in config (#394)

### Description
This PR sets `pad_token_id` in `genai_config.json` to a single value when a model does not specify a pad token id but it specifies a list of EOS token ids.

### Motivation and Context
When the pad token id is not specified, `pad_token_id` in `genai_config.json` stores the same value that `eos_token_id` in `genai_config.json` contains. When `eos_token_id` has a list of EOS token ids, then `pad_token_id` also has a list of pad token ids. This causes a parsing issue in ONNX Runtime GenAI because it expects only one pad token id.

This PR also fixes [this issue](https://github.com/microsoft/onnxruntime-genai/issues/384).
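Read in isolation, the fallback introduced in builder.py behaves like the following sketch (the token ids are illustrative, not taken from a real config):

```python
# Prefer an explicit pad_token_id; otherwise fall back to the EOS id, taking the
# first element when eos_token_id is a list, so the config always holds one value.
eos_token_id = [32000, 32007]   # illustrative multi-EOS setup
pad_token_id = None             # model config does not define a pad token

resolved = (pad_token_id
            if pad_token_id is not None
            else eos_token_id[0] if isinstance(eos_token_id, list)
            else eos_token_id)
print(resolved)  # 32000 -- a single id, which is what the GenAI config parser expects
```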
---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index fd9c90d53..2f320b729 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -228,7 +228,7 @@ def make_genai_config(self, model_name_or_path, extra_kwargs, out_dir):
                 "num_key_value_heads": self.num_kv_heads,
             },
             "eos_token_id": config.eos_token_id,
-            "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id,
+            "pad_token_id": config.pad_token_id if hasattr(config, "pad_token_id") and config.pad_token_id is not None else config.eos_token_id[0] if isinstance(config.eos_token_id, list) else config.eos_token_id,
             "type": self.model_type[ : self.model_type.find("For")].lower(),
             "vocab_size": self.vocab_size,
         },

From 88d46dd30c34af12f31c2d587fd959e8534ade8c Mon Sep 17 00:00:00 2001
From: Baiju Meswani
Date: Fri, 3 May 2024 09:26:06 -0700
Subject: [PATCH 7/8] Use std::experimental::filesystem instead of std::filesystem (#393)

---
 CMakeLists.txt                  |  1 +
 cmake/cxx_standard.cmake        |  6 +++++-
 src/config.cpp                  |  4 ++--
 src/config.h                    |  4 ++--
 src/filesystem.h                | 11 +++++++++++
 src/generators.h                |  3 ++-
 src/logging.cpp                 |  2 +-
 src/models/model.cpp            |  2 +-
 src/tokenizer/c_api/tfmtok_c.cc |  4 ++--
 src/tokenizer/config.cc         |  5 ++---
 src/tokenizer/tokenizer.cc      |  4 ++--
 11 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 src/filesystem.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ac13d590..8926e8a6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ message("Building onnxruntime-genai for version ${VERSION_INFO}")
 # Checking if CUDA is supported
 include(CheckLanguage)
 add_compile_definitions(BUILDING_ORT_GENAI_C)
+
 if(USE_CUDA)
   check_language(CUDA)
   if(CMAKE_CUDA_COMPILER)

diff --git a/cmake/cxx_standard.cmake b/cmake/cxx_standard.cmake
index 7e752d40b..52732c2f2 100644
--- a/cmake/cxx_standard.cmake
+++ b/cmake/cxx_standard.cmake
@@ -9,4 +9,8 @@ elseif (USE_CUDA AND CMAKE_CUDA_COMPILER AND CMAKE_CUDA_COMPILER_VERSION VERSION
 else ()
   message("Test is using C++20")
   set(CMAKE_CXX_STANDARD 20)
-endif ()
\ No newline at end of file
+endif ()
+
+if ("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9)
+  add_compile_definitions(USE_EXPERIMENTAL_FILESYSTEM)
+endif()
\ No newline at end of file

diff --git a/src/config.cpp b/src/config.cpp
index 39341f5b5..7dc3133ec 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -397,7 +397,7 @@ struct RootObject_Element : JSON::Element {
   JSON::Element& t_;
 };
 
-void ParseConfig(const std::filesystem::path& filename, Config& config) {
+void ParseConfig(const fs::path& filename, Config& config) {
   std::ifstream file(filename, std::ios::binary | std::ios::ate);
   if (!file.is_open()) {
     throw std::runtime_error("Error opening " + filename.string());
@@ -421,7 +421,7 @@ void ParseConfig(const std::filesystem::path& filename, Config& config) {
   }
 }
 
-Config::Config(const std::filesystem::path& path) : config_path{path} {
+Config::Config(const fs::path& path) : config_path{path} {
   ParseConfig(path / "genai_config.json", *this);
 
   if (model.context_length == 0)

diff --git a/src/config.h b/src/config.h
index b94e05ca0..8fb5debdc 100644
--- a/src/config.h
+++ b/src/config.h
@@ -6,9 +6,9 @@ namespace Generators {
 
 struct Config {
   Config() = default;
-  Config(const std::filesystem::path& path);
+  Config(const fs::path& path);
 
-  std::filesystem::path config_path;  // Path of the config directory
+  fs::path config_path;  // Path of the config directory
 
   using ProviderOption = std::pair;
   struct ProviderOptions {

diff --git a/src/filesystem.h b/src/filesystem.h
new file mode 100644
index 000000000..45c4c7015
--- /dev/null
+++ b/src/filesystem.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// TODO(baijumeswani): Remove experimental when packaging pipeline can use GCC > 8
+#ifdef USE_EXPERIMENTAL_FILESYSTEM
+#include <experimental/filesystem>
+namespace fs = std::experimental::filesystem;
+#else
+#include <filesystem>
+namespace fs = std::filesystem;
+#endif

diff --git a/src/generators.h b/src/generators.h
index e6ad6f0e1..3f7c1d8e0 100644
--- a/src/generators.h
+++ b/src/generators.h
@@ -5,8 +5,9 @@
 #include
 #include
 #include
-#include <filesystem>
+#include "filesystem.h"
 #include
+#include
 #include "span.h"
 #include
 #include

diff --git a/src/logging.cpp b/src/logging.cpp
index 6c334f50a..edd698168 100644
--- a/src/logging.cpp
+++ b/src/logging.cpp
@@ -44,7 +44,7 @@ void SetLogString(std::string_view name, std::string_view value) {
   if (value.empty())
     gp_logfile.reset();
   else {
-    std::filesystem::path filename{value};
+    fs::path filename{std::string(value)};
     gp_logfile = std::make_unique(filename);
   }

diff --git a/src/models/model.cpp b/src/models/model.cpp
index 35a9b4ad4..d760824b3 100644
--- a/src/models/model.cpp
+++ b/src/models/model.cpp
@@ -297,7 +297,7 @@ void Model::CreateSessionOptions() {
   }
 
   if (options.enable_profiling.has_value()) {
-    std::filesystem::path profile_file_prefix{options.enable_profiling.value()};
+    fs::path profile_file_prefix{options.enable_profiling.value()};
     ort_options.EnableProfiling(profile_file_prefix.c_str());
   }

diff --git a/src/tokenizer/c_api/tfmtok_c.cc b/src/tokenizer/c_api/tfmtok_c.cc
index 02c57ce65..3dc9be009 100644
--- a/src/tokenizer/c_api/tfmtok_c.cc
+++ b/src/tokenizer/c_api/tfmtok_c.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include
-#include <filesystem>
+#include "../filesystem.h"
 
 #include "tfmtok.h"
@@ -117,7 +117,7 @@ tfmError_t TFM_API_CALL TfmCreateTokenizer(TfmTokenizer** tokenizer,
     return kTfmErrorInvalidArgument;
   }
 
-  if (!std::filesystem::is_directory(tokenizer_path)) {
+  if (!fs::is_directory(tokenizer_path)) {
    last_error_message = std::string("Cannot find the directory of ") + tokenizer_path;
     return kTfmErrorInvalidArgument;
   }

diff --git a/src/tokenizer/config.cc b/src/tokenizer/config.cc
index dbc0908cf..a40b7d7db 100644
--- a/src/tokenizer/config.cc
+++ b/src/tokenizer/config.cc
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include <filesystem>
+#include "../filesystem.h"
 
 #include "config.h"
 
@@ -68,8 +68,7 @@ TfmStatus TokenConfig::LoadJson(const std::string& json_path) {
   simdjson::dom::parser parser;
   simdjson::dom::element root;
 
-  if (!std::filesystem::exists(
-          std::filesystem::path(json_path).lexically_normal())) {
+  if (!fs::exists(fs::path(json_path))) {
     return {kTfmErrorInvalidFile, std::string(json_path) + " not found"};
   }
   std::string json_text = PatchJsonText(json_path);

diff --git a/src/tokenizer/tokenizer.cc b/src/tokenizer/tokenizer.cc
index 251595856..4f52acd72 100644
--- a/src/tokenizer/tokenizer.cc
+++ b/src/tokenizer/tokenizer.cc
@@ -1,7 +1,7 @@
 #include "token_bpe.h"
 #include "token_rwkv.h"
 
-#include <filesystem>
+#include "../filesystem.h"
 #include
 
 namespace tfm {
@@ -30,7 +30,7 @@ TfmStatus CreateBPETokenizer(const std::string& tokenizer_path,
   if (type.empty()) {
     if (BPETokenizer::IsSupportedModel(GetModelName(token_cfg->tokenizer_class_))) {
       type = "BPE";
-    } /* else if (std::filesystem::exists(tokenizer_path + "/tokenizer.model")) {
+    } /* else if (fs::exists(tokenizer_path + "/tokenizer.model")) {
       // if 'tokenizer.model exists in the tokenizer_path, then it is a sentencepiece model
       type = "SPM";
     } */ else {

From b272ba45ca0c1dc62cfb35f4675b6683b421ebd5 Mon Sep 17 00:00:00 2001
From: Yufeng Li
Date: Fri, 3 May 2024 10:43:18 -0700
Subject: [PATCH 8/8] update GQA message (#396)

---
 src/python/py/models/builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 2f320b729..940f76e55 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -171,7 +171,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
         if (self.ep in {"cuda", "dml"} and self.io_dtype == TensorProto.FLOAT16) or (enable_GQA_on_CPU and self.ep == "cpu" and self.io_dtype == TensorProto.FLOAT):
             # Change model settings for GroupQueryAttention
             self.attention_attrs["op_type"] = "GroupQueryAttention"
-            print("GroupQueryAttention (GQA) is used in this model. GQA is currently supported only for INT4 and FP16 on the CUDA and DML execution providers.")
+            print("GroupQueryAttention (GQA) is used in this model.")
 
             # DML doesn't support packed Q/K/V for GQA yet
             self.attention_attrs["use_packed_matmul"] = self.ep != "dml" and self.num_attn_heads == self.num_kv_heads