From 18c29ff12e4b765c3682a1b587c81c90b26a08b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=BB=84=E5=AE=87=E6=89=AC?= <huang.yuyang@think-force.com>
Date: Tue, 23 Jul 2024 10:55:50 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=87=AA=E5=AE=9A=E4=B9=89?=
 =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E6=96=87=E6=A1=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md         |   5 +-
 docs/custom.md    | 108 +++++++++++++++++++++++
 docs/custom_op.md | 212 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 324 insertions(+), 1 deletion(-)
 create mode 100644 docs/custom.md
 create mode 100644 docs/custom_op.md

diff --git a/README.md b/README.md
index 72464589..a99be62f 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ fastllm是纯c++实现，无第三方依赖的多平台高性能大模型推理
 - 🚀 支持动态Batch，流式输出
 - 🚀 前后端分离设计，便于支持新的计算设备
 - 🚀 目前支持ChatGLM系列模型，Qwen系列模型，各种LLAMA模型(ALPACA, VICUNA等)，BAICHUAN模型，MOSS模型，MINICPM模型等
+- 🚀 支持Python自定义模型结构
 
 ## 快速开始
 
@@ -69,12 +70,14 @@ python3 -m ftllm.webui -t 16 -p ~/Qwen2-7B-Instruct/ --port 8080
 
 一些早期的HuggingFace模型无法直接读取，可以参考 [模型转换](docs/models.md#模型导出convert-offline) 转换fastllm格式的模型
 
+可以自定义模型结构，具体见 [自定义模型](docs/custom.md)
+
 ### 运行demo程序 (c++)
 
 ```
 # 进入fastllm/build-fastllm目录
 
-# 命令行聊天程序, 支持打字机效果 (只支持Linux）
+# 命令行聊天程序, 支持打字机效果
 ./main -p ~/Qwen2-7B-Instruct/ 
 
 # 简易webui, 使用流式输出 + 动态batch，可多路并发访问
diff --git a/docs/custom.md b/docs/custom.md
new file mode 100644
index 00000000..a0b956af
--- /dev/null
+++ b/docs/custom.md
@@ -0,0 +1,108 @@
+对于Fastllm框架中没有支持的模型，可以通过自定义模型结构来支持
+
+Python 自定义模型只需要一个python文件来描述模型结构，可参考 [QWEN](../example/python/qwen2.py) 中的实现
+
+### Python自定义模型的使用
+
+使用ftllm.chat, ftllm.webui, ftllm.server时，可以加入参数--custom来指定自定义模型文件
+
+假设我们的模型位于 "~/Qwen2-7B-Instruct/" 目录，自定义模型位于 "~/qwen2.py"
+
+那么可以使用命令
+
+``` sh
+python3 -m ftllm.chat -t 16 -p ~/Qwen2-7B-Instruct/ --custom ~/qwen2.py 
+```
+
+来通过自定义模型文件加载Qwen2模型，server和webui用法类似
+
+### Python自定义模型的写法
+
+自定义模型时，需要实现一个模型的描述类，继承自ftllm.llm.ComputeGraph
+
+对应 [QWEN](../example/python/qwen2.py) 中的代码
+
+``` python
+from ftllm.llm import ComputeGraph
+class Qwen2Model(ComputeGraph):
+```
+
+文件最后需要定义 `__model__` 变量来指定自定义模型结构对应的class, 对应代码
+
+``` python
+__model__ = Qwen2Model
+```
+
+模型描述类中需要实现build方法，来获取模型参数、描述计算流程
+
+这里以示例代码为例介绍
+
+``` python
+class Qwen2Model(ComputeGraph):
+    def build(self):
+        # 1. 获取weight, data, config
+        weight, data, config = self.weight, self.data, self.config
+
+        # 2. 设置一些config
+        config["max_positions"] = 128000
+
+        # 3. 描述计算流程
+        head_dim = config["hidden_size"] // config["num_attention_heads"]
+        self.Embedding(data["inputIds"], weight["model.embed_tokens.weight"], data["hiddenStates"]);
+        # 以下是计算流程，具体参见示例代码
+```
+
+#### `self.config`
+
+模型配置，默认会从模型文件夹下的 `config.json` 文件中读取
+
+build方法中可以修改config中的参数，例如改动 `max_positions` 可以修改上下文长度
+
+有一些模型的 `config.json` 中使用的变量名不一致，需要在build过程中手动为config赋值。
+
+例如在TeleChat7B模型的配置中没有 `max_positions` 变量，而是用 `seq_length` 变量代表长度，那么在build方法中需要用如下代码赋值：
+
+``` python 
+self.config["max_positions"] = self.config["seq_length"]
+```
+
+config中，有以下变量必须要赋值（如果config.json中变量名一致，可以不处理）：
+
+``` python
+self.config["max_positions"] #代表最长上下文长度
+```
+
+#### `self.weight`
+
+代表权重数据
+
+`self.weight[weightName]` 代表模型文件中名为weightName的参数（对应HF模型文件夹中.safetensors文件中的参数名）
+
+#### `self.data`
+
+代表计算流程的中间变量和输入变量
+
+`self.data[dataName]` 代表名为dataName的中间变量，`dataName` 可以使用除以下输入变量名之外的任意字符串
+
+输入变量：
+
+``` python
+data["inputIds"] # 输入token
+data["positionIds"] # 位置信息
+data["attentionMask"] # mask信息
+data["sin"] # 用于旋转编码的sin
+data["cos"] # 用于旋转编码的cos
+data["atype"] # 推理中的数据类型
+data["pastKey."][i] # 第i个block的key cache
+data["pastValue."][i] # 第i个block的value cache
+```
+
+#### 计算流程及算子
+
+使用基类ComputeGraph添加算子的函数来描述计算流程
+
+目前支持的算子见文档 [自定义模型算子](./custom_op.md)
+
+### cpp版本的自定义模型
+
+（cpp版本的自定义模型接口还在修改中...）
diff --git a/docs/custom_op.md b/docs/custom_op.md
new file mode 100644
index 00000000..b5eb2b3f
--- /dev/null
+++ b/docs/custom_op.md
@@ -0,0 +1,212 @@
+## 自定义模型算子文档
+
+### `AddTo`
+```python
+def AddTo(self, input0, input1, alpha = 1.0):
+    """
+    将两个输入节点相加，并乘以一个可选的缩放因子 alpha。
+
+    参数:
+    input0 (GraphNode): 第一个输入节点。
+    input1 (GraphNode): 第二个输入节点。
+    alpha (float, optional): 缩放因子，默认为 1.0。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "AddTo", 
+                       "nodes": {"input0": input0, "input1": input1, "alpha": FloatGraphNode(alpha)}})
+```
+
+### `DataTypeAs`
+```python
+def DataTypeAs(self, input, input1):
+    """
+    将输入节点的数据类型转换为另一个输入节点的数据类型。
+
+    参数:
+    input (GraphNode): 需要转换数据类型的输入节点。
+    input1 (GraphNode): 目标数据类型的输入节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "DataTypeAs", 
+                       "nodes": {"input": input, "input1": input1}})
+```
+
+### `Embedding`
+```python
+def Embedding(self, input, weight, output):
+    """
+    执行嵌入操作，将输入索引映射到嵌入权重。
+
+    参数:
+    input (GraphNode): 输入索引节点。
+    weight (GraphNode): 嵌入权重节点。
+    output (GraphNode): 输出节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "Embedding", 
+                       "nodes": {"input": input, "weight": weight, "output": output}})
+```
+
+### `ExpandHead`
+```python
+def ExpandHead(self, input, headDim):
+    """
+    把input最后一维展开成[-1, headDim]。
+
+    参数:
+    input (GraphNode): 输入节点。
+    headDim (int): 头部维度大小。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "ExpandHeads", 
+                       "nodes": {"input": input, "headDim": IntGraphNode(headDim)}})
+```
+
+### `FusedAttention`
+```python
+def FusedAttention(self, q, k, v, curk, curv, original, mask, output, seqLens, 
+                   scale, maskType=0, unitLen=128):
+    """
+    执行Attention操作。
+
+    参数:
+    q (GraphNode): 查询节点。
+    k (GraphNode): key cache 
+    v (GraphNode): value cache 
+    curk (GraphNode): 当前key
+    curv (GraphNode): 当前value
+    original (GraphNode): 原始节点，用于恢复计算后的shape
+    mask (GraphNode): 掩码
+    output (GraphNode): 输出
+    seqLens (GraphNode): 序列长度
+    scale (float): 缩放因子
+    maskType (int, optional): 掩码类型，默认为 0。
+    unitLen (int, optional): 单元长度，默认为 128。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "FusedAttention", 
+                       "nodes": {"q": q, "k": k, "v": v, "curk": curk, "curv": curv, 
+                                "original": original, "mask": mask, "output": output, "seqLens": seqLens, 
+                                 "scale": FloatGraphNode(scale), 
+                                 "maskType": IntGraphNode(maskType), "unitLen": IntGraphNode(unitLen)}})
+```
+
+### `Linear`
+```python
+def Linear(self, input, weight, bias, output):
+    """
+    执行线性变换操作。
+
+    参数:
+    input (GraphNode): 输入节点。
+    weight (GraphNode): 权重节点。
+    bias (GraphNode): 偏置节点。
+    output (GraphNode): 输出节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "Linear", 
+                       "nodes": {"input": input, "weight": weight, "bias": bias, "output": output}})
+```
+
+### `LlamaRotatePosition2D`
+```python
+def LlamaRotatePosition2D(self, input, positionIds, sin, cos, rotaryDim):
+    """
+    执行 Llama 模型的二维位置旋转操作。
+
+    参数:
+    input (GraphNode): 输入节点。
+    positionIds (GraphNode): 位置 ID 节点。
+    sin (GraphNode): 正弦节点。
+    cos (GraphNode): 余弦节点。
+    rotaryDim (int): 旋转维度大小。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "LlamaRotatePosition2D", 
+                       "nodes": {"input": input, "positionIds": positionIds, "sin": sin, "cos": cos, "rotaryDim": IntGraphNode(rotaryDim)}})
+```
+
+### `MulTo`
+```python
+def MulTo(self, input0, input1):
+    """
+    将两个输入节点相乘。
+
+    参数:
+    input0 (GraphNode): 第一个输入节点。
+    input1 (GraphNode): 第二个输入节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "MulTo", 
+                       "nodes": {"input0": input0, "input1": input1}})
+```
+
+### `RMSNorm`
+```python
+def RMSNorm(self, input, weight, eps, output):
+    """
+    执行 RMS 归一化操作。
+
+    参数:
+    input (GraphNode): 输入节点。
+    weight (GraphNode): 权重节点。
+    eps (float): 小常数，用于防止除零错误。
+    output (GraphNode): 输出节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "RMSNorm", 
+                       "nodes": {"input": input, "weight": weight, "eps": FloatGraphNode(eps), "output": output}})
+```
+
+### `Silu`
+```python
+def Silu(self, input, output):
+    """
+    执行 SiLU（Sigmoid Linear Unit）激活函数操作。
+
+    参数:
+    input (GraphNode): 输入节点。
+    output (GraphNode): 输出节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "Silu", 
+                       "nodes": {"input": input, "output": output}})
+```
+
+### `SplitLastTokenStates`
+```python
+def SplitLastTokenStates(self, input, seqLens, output):
+    """
+    分割batch输入中每个batch的最后一个 token 状态。
+
+    参数:
+    input (GraphNode): 输入节点。
+    seqLens (GraphNode): 序列长度节点。
+    output (GraphNode): 输出节点。
+
+    返回:
+    无返回值，结果存储在内部图结构中。
+    """
+    self.graph.append({"type": "SplitLastTokenStates", 
+                       "nodes": {"input": input, "output": output, "seqLens": seqLens}})
+```
\ No newline at end of file