From 37ec77fca25b500d3903a549378738dae061f3b2 Mon Sep 17 00:00:00 2001
From: "Nat Kershaw (MSFT)"
Date: Tue, 19 Nov 2024 12:20:51 -0800
Subject: [PATCH] Multilora docs (#22865)

---
 docs/genai/api/c.md              |  75 +++++++++++++-
 docs/genai/api/csharp.md         |  86 ++++++++++++++++
 docs/genai/api/java.md           |   4 +
 docs/genai/api/python.md         |  55 ++++++++++-
 docs/genai/reference/adapter.md  |  66 +++++++++++++
 docs/genai/tutorials/finetune.md | 165 +++++++++++++++++++++++++++++++
 6 files changed, 449 insertions(+), 2 deletions(-)
 create mode 100644 docs/genai/reference/adapter.md
 create mode 100644 docs/genai/tutorials/finetune.md

diff --git a/docs/genai/api/c.md b/docs/genai/api/c.md
index c08c1936f125b..b739ef7c3314a 100644
--- a/docs/genai/api/c.md
+++ b/docs/genai/api/c.md
@@ -280,7 +280,7 @@ OGA_EXPORT OgaResult* OGA_API_CALL OgaGeneratorParamsSetInputSequences(OgaGenera
 
 ### Set model input
 
-Set an additional model input, aside from the input_ids. For example additional inputs for LoRA adapters.
+Set an additional model input, aside from the input_ids.
 
 ### Parameters
 
@@ -433,6 +433,79 @@ More details on the current runtime options can be found [here](https://github.c
 OGA_EXPORT void OGA_API_CALL OgaGenerator_SetRuntimeOption(OgaGenerator* generator, const char* key, const char* value);
 ```
 
+## Adapter API
+
+This API is used to load and switch fine-tuned adapters, such as LoRA adapters.
+
+### Create adapters
+
+Creates the object that manages the adapters. This object is used to load all the model adapters and is responsible for reference counting the loaded adapters.
+
+```c
+OGA_EXPORT OgaResult* OGA_API_CALL OgaCreateAdapters(const OgaModel* model, OgaAdapters** out);
+```
+
+#### Parameters
+
+* `model`: the `OgaModel`, which has previously been created
+
+#### Results
+
+* `out`: a reference to the `OgaAdapters` object that was created
+
+### Load adapter
+
+Loads the model adapter from the given adapter file path and adapter name.
+
+```c
+OGA_EXPORT OgaResult* OGA_API_CALL OgaLoadAdapter(OgaAdapters* adapters, const char* adapter_file_path, const char* adapter_name);
+```
+
+#### Parameters
+
+* `adapters`: The OgaAdapters object into which to load the adapter.
+* `adapter_file_path`: The file path of the adapter to load.
+* `adapter_name`: A unique identifier for the adapter, used to refer to the adapter in subsequent calls.
+
+#### Return value
+
+`OgaResult` containing an error message if the adapter failed to load.
+
+### Unload adapter
+
+Unloads the adapter with the given identifier from the set of previously loaded adapters. If the adapter is not found, or if it cannot be unloaded because it is still in use, an error is returned.
+
+```c
+OGA_EXPORT OgaResult* OGA_API_CALL OgaUnloadAdapter(OgaAdapters* adapters, const char* adapter_name);
+```
+
+#### Parameters
+
+* `adapters`: The OgaAdapters object from which to unload the adapter.
+* `adapter_name`: The name of the adapter to unload.
+
+#### Return value
+
+`OgaResult` containing an error message if the adapter failed to unload. This can occur if the method is called with an adapter that is not already loaded, or one that is still marked active by an `OgaGenerator` in use.
+
+### Set active adapter
+
+Sets the adapter with the given adapter name as active for the given OgaGenerator object.
+
+```c
+OGA_EXPORT OgaResult* OGA_API_CALL OgaSetActiveAdapter(OgaGenerator* generator, OgaAdapters* adapters, const char* adapter_name);
+```
+
+#### Parameters
+
+* `generator`: The OgaGenerator object on which to set the active adapter.
+* `adapters`: The OgaAdapters object that manages the model adapters.
+* `adapter_name`: The name of the adapter to set as active.
+
+#### Return value
+
+`OgaResult` containing an error message if the adapter failed to be set as active. This can occur if the method is called with an adapter that has not been previously loaded.
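+
+### Example
+
+A minimal sketch of how these functions fit together. It assumes `model` and `generator` have already been created with the model and generator functions above, uses a placeholder adapter path and name, and assumes the usual `OgaDestroyAdapters` cleanup call; error handling is collapsed into a single chain of checks.
+
+```c
+OgaAdapters* adapters = NULL;
+
+// Create the container that manages all adapters for this model
+OgaResult* result = OgaCreateAdapters(model, &adapters);
+
+// Load an adapter file and register it under the name "travel"
+if (!result) result = OgaLoadAdapter(adapters, "adapters/travel.onnx_adapter", "travel");
+
+// Make the adapter active for this generator before generating tokens
+if (!result) result = OgaSetActiveAdapter(generator, adapters, "travel");
+
+// ... run generation with the active adapter ...
+
+// Unload the adapter once no generator is using it
+if (!result) result = OgaUnloadAdapter(adapters, "travel");
+
+OgaDestroyAdapters(adapters);
+```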
+
 ## Enums and structs
 
 ```c
diff --git a/docs/genai/api/csharp.md b/docs/genai/api/csharp.md
index 915e6e2bf5787..bc9b478440449 100644
--- a/docs/genai/api/csharp.md
+++ b/docs/genai/api/csharp.md
@@ -155,6 +155,33 @@ public void GenerateNextToken()
 public ReadOnlySpan<int> GetSequence(ulong index)
 ```
 
+### Set active adapter
+
+Sets the active adapter on this Generator instance.
+
+```csharp
+using var model = new Model(modelPath);
+using var genParams = new GeneratorParams(model);
+using var generator = new Generator(model, genParams);
+using var adapters = new Adapters(model);
+string adapterName = "...";
+
+generator.SetActiveAdapter(adapters, adapterName);
+```
+
+#### Parameters
+
+* `adapters`: the previously created `Adapters` object
+* `adapterName`: the name of the adapter to activate
+
+#### Return value
+
+`void`
+
+#### Exception
+
+Throws on error.
+
 ## Sequences class
 
 ### Num sequences member
@@ -169,3 +196,62 @@ public ulong NumSequences { get { return _numSequences; } }
 public ReadOnlySpan<int> this[ulong sequenceIndex]
 ```
 
+## Adapter class
+
+This API is used to load and switch fine-tuned adapters, such as LoRA adapters.
+
+### Constructor
+
+Constructs an instance of the Adapters class.
+
+```csharp
+using var model = new Model(modelPath);
+
+using var adapters = new Adapters(model);
+```
+
+#### Parameters
+
+* `model`: a previously constructed `Model` object
+
+### Load Adapter method
+
+Loads an adapter file from disk.
+
+```csharp
+string adapterPath = "...";
+string adapterName = "...";
+
+adapters.LoadAdapter(adapterPath, adapterName);
+```
+
+#### Parameters
+
+* `adapterPath`: the path to the adapter file on disk
+* `adapterName`: a string identifier used to refer to the adapter in subsequent methods
+
+#### Return value
+
+`void`
+
+### Unload Adapter method
+
+Unloads an adapter file from memory.
+
+```csharp
+adapters.UnloadAdapter(adapterName);
+```
+
+#### Parameters
+
+* `adapterName`: the name of the adapter to unload
+
+#### Return value
+
+`void`
+
+#### Exception
+
+Throws an exception on error.
+
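+### Example
+
+The sketch below strings the calls on this page together. The model path and adapter file are placeholders, and token generation itself is elided.
+
+```csharp
+using var model = new Model(modelPath);
+using var adapters = new Adapters(model);
+
+// Load an adapter file and give it a name to refer to later
+adapters.LoadAdapter("adapters/travel.onnx_adapter", "travel");
+
+using var genParams = new GeneratorParams(model);
+using var generator = new Generator(model, genParams);
+
+// Activate the adapter on this generator before generating tokens
+generator.SetActiveAdapter(adapters, "travel");
+
+// ... generate tokens ...
+
+// Unload the adapter once it is no longer active on any generator
+adapters.UnloadAdapter("travel");
+```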
+
diff --git a/docs/genai/api/java.md b/docs/genai/api/java.md
index 47e654566c569..322ddead8970f 100644
--- a/docs/genai/api/java.md
+++ b/docs/genai/api/java.md
@@ -610,3 +610,7 @@ public int[] getSequence(long sequenceIndex)
 
 The sequence as an array of integers.
 
+
+## Adapter class
+
+_Coming very soon!_
diff --git a/docs/genai/api/python.md b/docs/genai/api/python.md
index fa72a03617623..31de33bf68773 100644
--- a/docs/genai/api/python.md
+++ b/docs/genai/api/python.md
@@ -316,4 +316,57 @@ Returns
 onnxruntime_genai.Generator.get_sequence(index: int) -> numpy.ndarray[numpy.int32]
 ```
 
-- `index`: (Required) The index of the sequence in the batch to return
\ No newline at end of file
+- `index`: (Required) The index of the sequence in the batch to return
+
+## Adapter class
+
+### Create
+
+Create an Adapters object, using a model that has been loaded.
+
+```python
+model = ...
+adapters = og.Adapters(model)
+```
+
+#### Parameters
+
+* `model`: the model that the adapters will be used with
+
+#### Return value
+
+An `Adapters` object
+
+### Load
+
+Load an adapter from disk into an Adapters object in memory.
+
+```python
+onnxruntime_genai.Adapters.load(file: str, name: str) -> None
+```
+
+#### Parameters
+
+* `file`: the location on disk from which to load the adapter
+* `name`: the name of the adapter
+
+#### Return value
+
+None
+
+### Set active adapter
+
+Sets the active adapter on a `Generator` object.
+
+```python
+onnxruntime_genai.Generator.set_active_adapter(adapters: onnxruntime_genai.Adapters, adapter: str) -> None
+```
+
+#### Parameters
+
+* `adapters`: the adapters object into which the identified adapter has been loaded
+* `adapter`: the name of the adapter to set as active
+
+#### Return value
+
+None
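+
+### Example
+
+A short sketch that ties the calls above together. The model folder, adapter path, and adapter name are placeholders, and the `load` call follows the signature described above.
+
+```python
+import onnxruntime_genai as og
+
+model = og.Model("path/to/model")
+
+# Create the adapter container and load one adapter from disk
+adapters = og.Adapters(model)
+adapters.load("adapters/travel.onnx_adapter", "travel")
+
+params = og.GeneratorParams(model)
+generator = og.Generator(model, params)
+
+# Activate the adapter on this generator before generating tokens
+generator.set_active_adapter(adapters, "travel")
+```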
\ No newline at end of file
diff --git a/docs/genai/reference/adapter.md b/docs/genai/reference/adapter.md
new file mode 100644
index 0000000000000..71b4ae579f2c2
--- /dev/null
+++ b/docs/genai/reference/adapter.md
@@ -0,0 +1,66 @@
+---
+title: Adapter file spec
+description: Specification for the .onnx_adapter file format
+has_children: false
+parent: Reference
+grand_parent: Generate API (Preview)
+nav_order: 2
+---
+
+# Adapter file specification
+
+## File format
+
+The adapter file format is FlatBuffers.
+
+## File extension
+
+The file extension is ".onnx_adapter".
+
+## Schema
+
+Link to the live [schema definition](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/lora/adapter_format/adapter_schema.fbs).
+
+The schema definition is as follows:
+
+```
+File :=
+    format_version := integer
+    adapter_version := integer
+    model_version := integer
+    [parameter := Parameter]
+```
+
+```
+Parameter :=
+    name := string
+    dimensions := [int64]
+    data_type := TensorDataType
+    [data := uint8]
+```
+
+```
+TensorDataType :=
+    UNDEFINED = 0 |
+    FLOAT = 1 |
+    UINT8 = 2 |
+    INT8 = 3 |
+    UINT16 = 4 |
+    INT16 = 5 |
+    INT32 = 6 |
+    INT64 = 7 |
+    STRING = 8 |
+    BOOL = 9 |
+    FLOAT16 = 10 |
+    DOUBLE = 11 |
+    UINT32 = 12 |
+    UINT64 = 13 |
+    COMPLEX64 = 14 |
+    COMPLEX128 = 15 |
+    BFLOAT16 = 16 |
+    FLOAT8E4M3FN = 17 |
+    FLOAT8E4M3FNUZ = 18 |
+    FLOAT8E5M2 = 19 |
+    FLOAT8E5M2FNUZ = 20
+```
\ No newline at end of file
diff --git a/docs/genai/tutorials/finetune.md b/docs/genai/tutorials/finetune.md
new file mode 100644
index 0000000000000..5d0302b896dfc
--- /dev/null
+++ b/docs/genai/tutorials/finetune.md
@@ -0,0 +1,165 @@
+---
+title: Run with LoRA adapters
+description: Use Olive and ONNX Runtime to generate and run fine-tuned LoRA adapters
+has_children: false
+parent: Tutorials
+grand_parent: Generate API (Preview)
+nav_order: 4
+---
+
+# Generate and run fine-tuned models with LoRA adapters
+{: .no_toc }
+
+Learn how to generate models and adapters in formats suitable for executing with ONNX Runtime.
+
+LoRA stands for Low Rank Adaptation. It is a popular method of fine-tuning that freezes some layers in a graph and provides the values of the weights of the variable layers in an artifact called an adapter.
+
+Multi LoRA uses multiple adapters at runtime to run different fine-tunings of the same model. Adapters can be per-scenario, per-tenant/customer, or per-user: anywhere from a handful of adapters to many hundreds or thousands.
+
+Olive generates models and adapters in ONNX format. These models and adapters can then be run with ONNX Runtime.
+
+## Setup
+
+1. Install Olive
+
+   This installs Olive from main. Replace with version 0.8.0 when it is released.
+
+   ```bash
+   pip install git+https://github.com/microsoft/olive
+   ```
+
+2. Install ONNX Runtime generate()
+
+   ```bash
+   pip install onnxruntime-genai
+   ```
+
+3. Install other dependencies
+
+   ```bash
+   pip install optimum peft
+   ```
+
+4. Downgrade torch and transformers
+
+   TODO: There is an export bug with torch 2.5.0 and an incompatibility with transformers>=4.45.0
+
+   ```bash
+   pip uninstall torch
+   pip install torch==2.4
+   pip uninstall transformers
+   pip install transformers==4.44
+   ```
+
+5. Choose a model
+
+   You can use a model from HuggingFace, or your own model. The model must be a PyTorch model.
+
+6. Decide whether you are fine-tuning your model, or using a pre-existing adapter
+
+   There are many pre-existing adapters on HuggingFace. If you are using multiple different adapters, these must all use the same fine-tuned layers of the original model.
+
+## Generate model and adapters in ONNX format
+
+1. If fine-tuning, run Olive to fine-tune your model
+
+   Note: this operation requires a system with an NVIDIA GPU, with CUDA installed.
+
+   Use the `olive finetune` command: https://microsoft.github.io/Olive/features/cli.html#finetune
+
+   Here is an example usage of the command:
+
+   ```bash
+   olive finetune --method qlora -m meta-llama/Meta-Llama-3-8B -d nampdn-ai/tiny-codes --train_split "train[:4096]" --eval_split "train[4096:4224]" --text_template "### Language: {programming_language} \n### Question: {prompt} \n### Answer: {response}" --per_device_train_batch_size 16 --per_device_eval_batch_size 16 --max_steps 150 --logging_steps 50 -o adapters\tiny-codes
+   ```
+
+2. Optionally, quantize your model
+
+   Use the `olive quantize` command: https://microsoft.github.io/Olive/features/cli.html#quantize
+
+3. Generate the ONNX model and adapter using the quantized model
+
+   Use the `olive auto-opt` command for this step: https://microsoft.github.io/Olive/features/cli.html#auto-opt
+
+   The `--adapter_path` can either be a HuggingFace adapter reference, or a path to the adapter you fine-tuned above.
+
+   The `--provider` argument can be an ONNX Runtime execution provider.
+
+   ```bash
+   olive auto-opt -m <path to your model> --adapter_path <path to your adapter> -o <output folder> --device cpu|gpu --provider <execution provider>
+   ```
+
+4. Convert adapters to `.onnx_adapter` format
+
+   Run this step once for each adapter that you have generated.
+
+   ```bash
+   olive convert-adapters --adapter_path <path to your adapter> --output_path <output path> --dtype float32
+   ```
+
+## Write your application
+
+The application below loads the base model, loads each adapter given on the command line, and runs generation once per adapter (or once against the base model if no adapters are given).
+
+```python
+import argparse
+import numpy as np
+import onnxruntime_genai as og
+
+# Minimal argument handling: model folder, adapter files, and a prompt
+parser = argparse.ArgumentParser()
+parser.add_argument("-m", "--model", type=str, required=True)
+parser.add_argument("-a", "--adapters", nargs="*", default=[])
+parser.add_argument("-p", "--prompt", type=str, required=True)
+args = parser.parse_args()
+
+prompt = args.prompt
+
+model = og.Model(args.model)
+tokenizer = og.Tokenizer(model)
+tokenizer_stream = tokenizer.create_stream()
+
+# Load each .onnx_adapter file, using the file path as the adapter name
+adapters = og.Adapters(model)
+for adapter in args.adapters:
+    adapters.load(adapter, adapter)
+
+params = og.GeneratorParams(model)
+# Workaround for models exported with transformers >= 4.45
+#params.set_model_input("onnx::Neg_67", np.array(0, dtype=np.int64))
+params.input_ids = tokenizer.encode(prompt)
+
+generator = og.Generator(model, params)
+
+if args.adapters:
+    for adapter in args.adapters:
+        print(f"[{adapter}]: {prompt}")
+        generator.set_active_adapter(adapters, adapter)
+
+        while not generator.is_done():
+            generator.compute_logits()
+            generator.generate_next_token()
+
+            new_token = generator.get_next_tokens()[0]
+            print(tokenizer_stream.decode(new_token), end='', flush=True)
+else:
+    print(f"[Base]: {prompt}")
+
+    while not generator.is_done():
+        generator.compute_logits()
+        generator.generate_next_token()
+```
+
+## Call the application
+
+```bash
+python app.py -m -a <.onnx_adapter files> -t -s -p
+```
+
+## References
+
+* [Python API docs](../api/python.md#adapter-class)
+* [Olive CLI docs](https://microsoft.github.io/Olive/features/cli.html)