diff --git a/README.md b/README.md
index 0c84df13..4c5f3e92 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # fastllm
+[English Document](README_EN.md)
+
 ## 介绍
 fastllm是纯c++实现,无第三方依赖的多平台高性能大模型推理库
diff --git a/README_EN.md b/README_EN.md
index d413e502..e13202c2 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -2,258 +2,146 @@
## Introduction
-fastllm is a high-performance large model inference library implemented in pure C++ with no third-party dependencies.
-
-6~7 billion parameter models can run smoothly on Android devices.
+fastllm is a high-performance large model inference library implemented purely in C++ with no third-party dependencies, supporting multiple platforms.
Deployment and communication QQ group: 831641348
-| [Quick Start](#quick-start) | [Model Acquisition](#model-acquisition) | [Development Plan](#development-plan) |
+| [Quick Start](#quick-start) | [Model Acquisition](#model-acquisition) |
-## Overview of Features
+## Features Overview
-- 🚀 Pure C++ implementation, easy to port across platforms, can be compiled directly on Android.
-- 🚀 ARM platform supports NEON instruction set acceleration, X86 platform supports AVX instruction set acceleration, NVIDIA platform supports CUDA acceleration, and all platforms are very fast.
-- 🚀 Supports floating-point models (FP32), half-precision models (FP16), and quantized models (INT8, INT4) for acceleration.
-- 🚀 Supports multi-card deployment, supports GPU + CPU hybrid deployment.
-- 🚀 Supports batch speed optimization.
-- 🚀 Supports dynamic batch stitching during concurrent computation.
-- 🚀 Supports streaming output, convenient for implementing typewriter effects.
-- 🚀 Supports Python invocation.
-- 🚀 Front-end and back-end separation design, easy to support new computing devices.
-- 🚀 Currently supports ChatGLM series models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, QWEN models, MOSS models, MINICPM models, etc.
+- 🚀 Pure C++ implementation, easy to port across platforms, directly compilable on Android
+- 🚀 Supports reading Hugging Face raw models and direct quantization
+- 🚀 Supports deploying an OpenAI API server
+- 🚀 Supports multi-card deployment, supports GPU + CPU hybrid deployment
+- 🚀 Supports dynamic batching and streaming output
+- 🚀 Front-end and back-end separation design, easy to support new computing devices
+- 🚀 Currently supports ChatGLM series models, Qwen2 series models, various LLAMA models (ALPACA, VICUNA, etc.), BAICHUAN models, MOSS models, MINICPM models, etc.
-## Two lines of code to accelerate (under testing, currently only supports ChatGLM series)
+## Quick Start
-Use the following command to install the fastllm_pytools package:
+### Compilation
-```sh
-cd fastllm
-mkdir build
-cd build
-cmake .. -DUSE_CUDA=ON # If not compiling with GPU, then use cmake .. -DUSE_CUDA=OFF
-make -j
-cd tools && python setup.py install
-```
+It is recommended to compile with cmake; you need gcc, g++ (9.4 or above recommended), make, and cmake (3.23 or above recommended) installed beforehand.
-To utilize fastllm for acceleration in your original inference program, you simply need to add two lines of code.
+GPU compilation requires a pre-installed CUDA compilation environment; using the latest CUDA version is recommended.
-```python
-# This is the original program, creating the model through the huggingface interface
-from transformers import AutoTokenizer, AutoModel
-tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
-model = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
+Compile using the following commands:
-# Add the following two lines to convert the huggingface model to the fastllm model
-# Currently, the from_hf interface can only accept original models or ChatGLM's int4, int8 quantized models, and cannot convert other quantized models temporarily
-from fastllm_pytools import llm
-model = llm.from_hf(model, tokenizer, dtype="float16") # dtype supports "float16", "int8", "int4"
-
-# Comment out this line model.eval()
-#model = model.eval()
+``` sh
+bash install.sh -DUSE_CUDA=ON # Compile GPU version
+# bash install.sh -DUSE_CUDA=ON -DCUDA_ARCH=89 # Specify CUDA architecture, e.g., 4090 uses architecture 89
+# bash install.sh # Compile CPU version only
```
-The model now supports the ChatGLM API functions chat and stream_chat, so the ChatGLM demo program can run without needing any other code modifications.
-
-The model also supports the following APIs for generating replies.
-```python
-# Generate a response
-print(model.response("Hello"))
-
-# Streaming response generation
-for response in model.stream_response("Hello"):
-    print(response, flush=True, end="")
-```
+For compilation on other platforms, refer to the documentation:
+[TFACC Platform](docs/tfacc.md)
-```python
-model.save("model.flm") # Export the fastllm model
-new_model = llm.model("model.flm") # Import the fastllm model
-```
+### Running the demo program (python)
-Note: This feature is in the testing phase, and currently, only ChatGLM and ChatGLM2 models have been verified to be accelerated with two lines of code.
+Assuming our model is located in the "~/Qwen2-7B-Instruct/" directory:
-## PEFT Support (In Testing, Currently Only Supports ChatGLM + LoRA)
+After compilation, you can use the following demos:
-Using 🤗PEFT, you can easily run fine-tuned large models. You can use the following method to accelerate your PEFT model with fastllm:
+``` sh
+# Use a model with float16 precision for conversation
+python3 -m ftllm.chat -t 16 -p ~/Qwen2-7B-Instruct/
-```python
-import sys
-from peft import PeftModel
-from transformers import AutoModel, AutoTokenizer
-sys.path.append('..')
-model = AutoModel.from_pretrained("THUDM/chatglm-6b", device_map='cpu', trust_remote_code=True)
-model = PeftModel.from_pretrained(model, "path/to/your/own/adapter") # Use your own PEFT adapter here
-model = model.eval()
-tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+# Quantize the model to int8 online for conversation
+python3 -m ftllm.chat -t 16 -p ~/Qwen2-7B-Instruct/ --dtype int8
-# If there is an active_adapter in the model, it will also be enabled by default in the fastllm model
-from fastllm_pytools import llm
-model = llm.from_hf(model, tokenizer, dtype="float16") # dtype supports "float16", "int8", "int4"
+# OpenAI API server (currently in testing and tuning phase)
+# Requires dependencies: pip install -r requirements-server.txt
+# Starts a server named 'qwen' on port 8080
+python3 -m ftllm.server -t 16 -p ~/Qwen2-7B-Instruct/ --port 8080 --model_name qwen
```
-Next, you can use the model just like a regular model (e.g., by calling the chat and stream_chat functions).
+For all demos, detailed parameters can be viewed with the --help argument.
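+
+Once the server is running, any OpenAI-compatible client should be able to talk to it. The sketch below is illustrative only: it assumes the server exposes the standard /v1 chat-completions route on the port and model name used above, and the API key value is just a placeholder for a local server.
+
+``` python
+# pip install openai
+from openai import OpenAI
+
+# Point the standard OpenAI client at the local ftllm server started above
+client = OpenAI(base_url = "http://localhost:8080/v1", api_key = "none")
+
+response = client.chat.completions.create(
+    model = "qwen",  # should match --model_name passed to ftllm.server
+    messages = [{"role": "user", "content": "Hello, who are you?"}],
+)
+print(response.choices[0].message.content)
+```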
-You can also change the adapter used by the PEFT model:
+The currently supported models can be found here: [Model List](docs/models.md)
-```python
-model.set_adapter('your adapter name')
-```
+For architectures that cannot directly read Hugging Face models, refer to the [Model Conversion Documentation](docs/convert_model.md) to convert models to the fastllm format.
-Or disable PEFT to use the original pre-trained model:
+### Running the demo program (c++)
-```python
-model.disable_adapter()
```
+# Enter the fastllm/build-fastllm directory
-## Inference Speed
-
-6B-level int4 model has a minimum latency of about 5.5ms on a single 4090.
-
-6B-level fp16 model achieves a maximum throughput of over 10000 tokens/s on a single 4090.
-
-The speed of the 6B-level int4 model on Snapdragon 865 is approximately 4~5 tokens/s.
-
-[For detailed benchmark data points, click here.](docs/benchmark.md)
-
-## CMMLU Accuracy Test
-
-| Model | Data Accuracy | CMMLU Score |
-|-----------------: |-------- |------------|
-| ChatGLM2-6b-fp16 | float32 | 50.16 |
-| ChatGLM2-6b-int8 | float32 | 50.14 |
-| ChatGLM2-6b-int4 | float32 | 49.63 |
-
-Currently, ChatGLM2 model has been tested. For specific testing steps, please click [here](test/cmmlu/README.md).
-
-## Quick Start
-
-### Compile
-
-
-It is recommended to compile using cmake, and you'll need to have a C++ compiler, make, and cmake installed beforehand.
-
-For gcc, version 9.4 or higher is recommended, and for cmake, version 3.23 or higher is recommended.
-
-For GPU compilation, ensure that you have the CUDA compilation environment installed, and it's recommended to use the latest possible CUDA version.
-
-To compile, use the following command:
-
-```sh
-cd fastllm
-mkdir build
-cd build
-cmake .. -DUSE_CUDA=ON # If not compiling with GPU, use cmake .. -DUSE_CUDA=OFF
-make -j
-```
-
-After compiling, you can install the simple Python toolkit with the following command.
-
-```sh
-cd tools # Now you are in the fastllm/build/tools directory
-python setup.py install
-
-```
-
-The compilation for different platforms can be referenced in the documentation.
-[TFACC Platform](docs/tfacc.md)
-
-### Run Demo Program.
-
-Assuming you have obtained the model named `model.flm` (refer to [Model Acquisition](#model-acquisition))
-
-After compilation, you can use the following demo in the build directory:
-
-```sh
-# Now you are in the fastllm/build directory
-
-# Command-line chat program, supports typewriter effect (Linux only)
+# Command line chat program, supports typewriter effect (Linux only)
./main -p model.flm
-# Simple web UI, using streaming output + dynamic batch, can handle multiple concurrent accesses
+# Simple webui, uses streaming output + dynamic batch, supports concurrent access
./webui -p model.flm --port 1234
-
-# Python version of the command-line chat program, using model creation and streaming conversation effects
-python tools/cli_demo.py -p model.flm
-
-# Python version of the simple web UI, you need to install streamlit-chat first
-streamlit run tools/web_demo.py model.flm
```
-For compiling on Windows, it's recommended to use CMake GUI + Visual Studio. You can complete the process in a graphical interface.
-
-If you encounter any issues during compilation, especially on Windows, you can refer to the documentation for troubleshooting. [FAQ](docs/faq.md)
+On Windows, compiling with CMake GUI + Visual Studio is recommended; the whole process can be completed in the graphical interface.
-### Simple Python Commands
+For compilation issues, especially on Windows, refer to the [FAQ](docs/faq.md).
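+
+If you do not have a model.flm file yet, one way to produce it is through the Python package, using the from_hf and save helpers that appear elsewhere in this repository. The snippet below is only a rough sketch; the checkpoint name and dtype are illustrative:
+
+``` python
+# A hedged sketch of exporting a Hugging Face checkpoint to fastllm's .flm format
+from transformers import AutoModel, AutoTokenizer
+from ftllm import llm
+
+path = "THUDM/chatglm2-6b"  # illustrative checkpoint; use any supported model
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code = True)
+model = AutoModel.from_pretrained(path, trust_remote_code = True)
+
+flm_model = llm.from_hf(model, tokenizer, dtype = "float16")  # "float16", "int8" or "int4"
+flm_model.save("model.flm")  # the file the C++ demos load with ./main -p model.flm
+```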
-If you have installed the simple Python toolkit after compilation, you can use Python to call some basic APIs. (If you haven't installed it, you can still use 'import' to directly use the compiled 'tools/fastllm_pytools')
+### Python API
-```python
+``` python
# Model creation
-from fastllm_pytools import llm
+from ftllm import llm
model = llm.model("model.flm")
# Generate response
-print(model.response("Hello"))
+print(model.response("你好"))
-# Stream response generation
-for response in model.stream_response("Hello"):
-    print(response, flush=True, end="")
+# Stream generate response
+for response in model.stream_response("你好"):
+    print(response, flush = True, end = "")
```
-Additionally, you can set the number of CPU threads and other parameters. For detailed API documentation, see [fastllm_pytools](docs/fastllm_pytools.md).
+Additional settings such as the CPU thread count can be found in the detailed API documentation: [ftllm](docs/ftllm.md)
-This package does not include low-level APIs. If you need more advanced features, please refer to [Python Binding API](#Python-Binding-API).
+This package does not include low-level APIs. For deeper functionality, refer to [Python Binding API](#Python-binding-API).
-## Python Binding API
+## Multi-Card Deployment
-
-```
-cd pyfastllm
-export USE_CUDA=OFF # Use CPU only; remove this line to use GPU
-python3 setup.py build
-python3 setup.py install
-cd examples/
-python cli_simple.py -m chatglm -p chatglm-6b-int8.flm
-# or
-python web_api.py -m chatglm -p chatglm-6b-int8.flm
-```
-You can test the above web API using web_api_client.py. For more usage details, see the documentation [API Documents](pyfastllm/README.md).
+### Using Multi-Card Deployment in Python Command Line Calls
-## Multi-GPU deployment
+``` sh
+# Use the --device parameter to control multi-card usage
+#--device cuda:1 # Set a single device
+#--device "['cuda:0', 'cuda:1']" # Deploy model evenly across multiple devices
+#--device "{'cuda:0': 10, 'cuda:1': 5, 'cpu': 1}" # Deploy model proportionally across multiple devices
+```
-### Using multi-GPU deployment in fastllm_pytools
+### Using Multi-Card Deployment in ftllm
``` python
-from fastllm_pytools import llm
-
-# Support the following three methods, need to be called before model creation
-llm.set_device_map("cuda:0") # Deploy the model on a single device
-llm.set_device_map(["cuda:0", "cuda:1"]) # Deploy the model evenly across multiple devices
-llm.set_device_map({"cuda:0": 10, "cuda:1": 5, "cpu": 1}) # Deploy the model on multiple devices with different ratios
+from ftllm import llm
+# Supports the following three methods, must be called before model creation
+llm.set_device_map("cuda:0") # Deploy model on a single device
+llm.set_device_map(["cuda:0", "cuda:1"]) # Deploy model evenly across multiple devices
+llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # Deploy model proportionally across multiple devices
```
-### Using multi-GPU deployment in the Python Binding API.
+### Using Multi-Card Deployment in Python Binding API
+
``` python
import pyfastllm as llm
-
-# Support the following method, needs to be called before model creation
-llm.set_device_map({"cuda:0": 10, "cuda:1": 5, "cpu": 1}) # Deploy the model on multiple devices with different ratios
+# Supports the following method, must be called before model creation
+llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu": 1}) # Deploy model proportionally across multiple devices
```
-### Using multi-GPU deployment in C++.
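+
+For either Python interface, the device map only takes effect if it is set before the model object is created. The following end-to-end sketch simply combines the ftllm calls already shown above; the split ratios and file name are illustrative:
+
+``` python
+from ftllm import llm
+
+# 1. Declare how the weights should be split across devices (must come first)
+llm.set_device_map({"cuda:0" : 10, "cuda:1" : 5, "cpu" : 1})
+
+# 2. Only then create the model; it is partitioned according to the map above
+model = llm.model("model.flm")
+print(model.response("你好"))
+```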
+
+### Using Multi-Card Deployment in C++
``` cpp
-// Support the following method, needs to be called before model creation
-fastllm::SetDeviceMap({{"cuda:0", 10}, {"cuda:1", 5}, {"cpu", 1}}); // Deploy the model on multiple devices with different ratios
+// Supports the following method, must be called before model creation
+fastllm::SetDeviceMap({{"cuda:0", 10}, {"cuda:1", 5}, {"cpu", 1}}); // Deploy model proportionally across multiple devices
```
-## Compiling and running with Docker
-Docker runtime requires NVIDIA runtime to be installed locally and default runtime needs to be changed to nvidia.
+## Docker Compilation and Running
+Running with docker requires NVIDIA Runtime to be installed locally and the default runtime to be changed to nvidia.
-1. Installing nvidia-container-runtime
+1. Install nvidia-container-runtime
```
sudo apt-get install nvidia-container-runtime
```
-2. Change the Docker default runtime to Nvidia.
+2. Change the docker default runtime to nvidia
/etc/docker/daemon.json
```
@@ -268,158 +156,41 @@ sudo apt-get install nvidia-container-runtime
"runtimeArgs": []
}
},
- "default-runtime": "nvidia" // Only this line is needed
+ "default-runtime": "nvidia" // This line is required
}
+
```
-3. Download the pre-trained model to the models directory.
+3. Download the converted models to the models directory
```
models
chatglm2-6b-fp16.flm
chatglm2-6b-int8.flm
```
-4. Compile and start the web UI.
+4. Compile and start the webui
```
DOCKER_BUILDKIT=0 docker compose up -d --build
```
-## Android上使用
+## Usage on Android
-### Compile
-```sh
-# Compiling on a PC requires downloading the NDK tool.
-# Alternatively, you can try compiling on a mobile device. In Termux, you can use cmake and gcc (no need for NDK).
+### Compilation
+``` sh
+# Compilation on a PC requires downloading the NDK tools
+# You can also try compiling on the phone, using cmake and gcc in termux (no NDK needed)
mkdir build-android
cd build-android
export NDK=
-# If the mobile device does not support it, remove "-DCMAKE_CXX_FLAGS=-march=armv8.2a+dotprod" (most newer phones support it).
+# If the phone does not support it, remove "-DCMAKE_CXX_FLAGS=-march=armv8.2a+dotprod" (most new phones support this)
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_CXX_FLAGS=-march=armv8.2a+dotprod ..
make -j
```
-### Run
-
-1. Install the Termux app on your Android device.
-2. Run 'termux-setup-storage' in Termux to grant permission to access phone files.
-3. Transfer the main file compiled with NDK, as well as the model file, to your phone and copy them to the root directory of Termux.
-4. Use the command chmod 777 main to grant permissions.
-5. You can then run the main file. For parameter format, see ./main --help.
-## Model Acquisition
-### Model Repository
-You can download pre-converted models from the following link.
-[huggingface](https://huggingface.co/huangyuyang)
-### Model export
-#### Exporting the ChatGLM model (default script exports the ChatGLM2-6b model).
-``` sh
-# ChatGLM-6B environment needs to be installed first.
-# If using a fine-tuned model, modify the code in chatglm_export.py file to create tokenizer and model.
-cd build -python3 tools/chatglm_export.py chatglm2-6b-fp16.flm float16 # Export float16 model -python3 tools/chatglm_export.py chatglm2-6b-int8.flm int8 # Export int8 model -python3 tools/chatglm_export.py chatglm2-6b-int4.flm int4 # Export int4 model -``` - -#### Exporting the Baichuan model (default script exports the baichuan-13b-chat model). - -``` sh -# Baichuan environment needs to be installed first. -# If using a fine-tuned model, modify the code in baichuan2flm.py file to create tokenizer and model. -# Export the corresponding model based on the required precision. -cd build -python3 tools/baichuan2flm.py baichuan-13b-fp16.flm float16 # Export float16 model -python3 tools/baichuan2flm.py baichuan-13b-int8.flm int8 # Export int8 model -python3 tools/baichuan2flm.py baichuan-13b-int4.flm int4 # Export int4 model -``` - -#### Exporting the Baichuan2 model (default script exports the baichuan2-7b-chat model). - -``` sh -# Baichuan2 environment needs to be installed first. -# If using a fine-tuned model, modify the code in baichuan2_2flm.py file to create tokenizer and model. -# Export the corresponding model based on the required precision. -cd build -python3 tools/baichuan2_2flm.py baichuan2-7b-fp16.flm float16 # Export float16 model -python3 tools/baichuan2_2flm.py baichuan2-7b-int8.flm int8 # Export int8 model -python3 tools/baichuan2_2flm.py baichuan2-7b-int4.flm int4 # Export int4 model -``` - -#### Exporting the MOSS model - -``` sh -# MOSS environment needs to be installed first. -# If using a fine-tuned model, modify the code in moss_export.py file to create tokenizer and model. -# Export the corresponding model based on the required precision. -cd build -python3 tools/moss_export.py moss-fp16.flm float16 # Export float16 model -python3 tools/moss_export.py moss-int8.flm int8 # Export int8 model -python3 tools/moss_export.py moss-int4.flm int4 # Export int4 model -``` - -#### Exporting LLAMA series models -``` sh -# Modify the build/tools/alpaca2flm.py program for exporting. -# The commands used for different LLAMA models vary greatly, so you need to configure them according to the parameters in torch2flm.py. -``` -Some models' conversions can be referenced in the examples [here](docs/llama_cookbook.md). - -#### Exporting the QWEN model -* **Qwen** -``` sh -# QWEN environment needs to be installed first. -# If using a fine-tuned model, modify the code in qwen2flm.py file to create tokenizer and model. -# Export the corresponding model based on the required precision. -cd build -python3 tools/qwen2flm.py qwen-7b-fp16.flm float16 # Export float16 model -python3 tools/qwen2flm.py qwen-7b-int8.flm int8 # Export int8 model -python3 tools/qwen2flm.py qwen-7b-int4.flm int4 # Export int4 model -``` - -* **Qwen1.5** -``` sh -# QWen2 environment needs to be installed first (transformers >= 4.37.0). -# Export the corresponding model based on the required precision. -cd build -python3 tools/llamalike2flm.py qwen1.5-7b-fp16.flm float16 "qwen/Qwen1.5-4B-Chat" # Export qwen1.5-4B-Chat float16 model -python3 tools/llamalike2flm.py qwen1.5-7b-int8.flm int8 "qwen/Qwen1.5-7B-Chat" # Export Qwen1.5-7B-Chat int8 model -python3 tools/llamalike2flm.py qwen1.5-7b-int4.flm int4 "qwen/Qwen1.5-14B-Chat" # Export Qwen1.5-14B-Chat int4 model -# The last parameter can be replaced with the model path. -``` - -#### Exporting the MINICPM model - -```sh -# MINICPM environment needs to be installed first (transformers >= 4.36.0). -# The default script exports the iniCPM-2B-dpo-fp16 model. 
-cd build
-python tools/minicpm2flm.py minicpm-2b-float16.flm # Export dpo-float16 model
-./main -p minicpm-2b-float16.flm # Execute the model
-
-## Development Plan
-
-If you have any features you need, feel free to bring them up in the discussion area.
-
-### Short-term plan
-
-- Add MMLU, CMMLU, and other test programs.
-- Support direct conversion of pre-quantized Hugging Face models.
-- Implement extrapolation to 8K length.
-
-### Mid-term Plan
-
-- Support more backends, such as OpenCL, Vulkan, and some NPU acceleration devices.
-- Support and validate more models, improve the model repository.
-- Optimize the tokenizer (since currently the original model's tokenizer can be directly used for tokenization in Python, this task is not urgent for now).
-
-### Long-term Plan
+### Running
-- Support ONNX model import and inference.
-- Support model fine-tuning.
+1. Install the Termux app on the Android device.
+2. Run termux-setup-storage in Termux to gain permission to access phone files.
+3. Copy the main binary compiled with the NDK, together with the model file, onto the phone and into the Termux root directory.
+4. Use the command ```chmod 777 main``` to grant permissions.
+5. Run the main file; see ```./main --help``` for the parameter format.
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md
index 17472d5f..40d1e024 100755
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -108,7 +108,7 @@ GPU编译时,根据使用的CUDA版本,将cudart cublas的相关dll文件复
* %CUDA_PATH%\bin\cublas64_12.dll
* %CUDA_PATH%\bin\cublasLt64_12.dll
-## fastllm_pytools
+## ftllm
### 释放内存报错: CUDA error when release memory
diff --git a/example/openai_server/README.md b/example/openai_server/README.md
index 45ddfa53..d1b4347c 100644
--- a/example/openai_server/README.md
+++ b/example/openai_server/README.md
@@ -16,7 +16,7 @@
## 依赖
以下依赖在python 3.12.2上没有问题
-1. 需要先安装fastllm_pytools工具包
+1. 需要先安装ftllm工具包
2. 
需要安装以下依赖 ```bash cd example/openai_server diff --git a/example/openai_server/fastllm_completion.py b/example/openai_server/fastllm_completion.py index 7b1099f4..395f2bd7 100644 --- a/example/openai_server/fastllm_completion.py +++ b/example/openai_server/fastllm_completion.py @@ -11,7 +11,7 @@ ChatCompletionRole) from protocal.openai_protocol import * -from fastllm_pytools import llm +from ftllm import llm class ConversationMessage: def __init__(self, role:str, content:str): diff --git a/test/basic/forward_check.py b/test/basic/forward_check.py index 8062499b..8b2e4276 100644 --- a/test/basic/forward_check.py +++ b/test/basic/forward_check.py @@ -7,7 +7,7 @@ import torch import tqdm from transformers import AutoModelForCausalLM, AutoTokenizer -from fastllm_pytools import llm +from ftllm import llm def args_parser(): parser = argparse.ArgumentParser(description = 'fastllm_test') diff --git a/test/basic/tokenizer_check.py b/test/basic/tokenizer_check.py index 9694ed72..37fd5e61 100644 --- a/test/basic/tokenizer_check.py +++ b/test/basic/tokenizer_check.py @@ -4,7 +4,7 @@ import logging import os from transformers import AutoTokenizer -from fastllm_pytools import llm +from ftllm import llm def args_parser(): parser = argparse.ArgumentParser(description = 'fastllm_test') diff --git a/test/cmmlu/baichuan.py b/test/cmmlu/baichuan.py index e3cc88ee..f8496526 100644 --- a/test/cmmlu/baichuan.py +++ b/test/cmmlu/baichuan.py @@ -72,7 +72,7 @@ def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, c torch_dtype=torch.float16, ) - from fastllm_pytools import llm; + from ftllm import llm; model = llm.from_hf(model, tokenizer, dtype = args.dtype); model.direct_query = True; diff --git a/test/cmmlu/chatglm.py b/test/cmmlu/chatglm.py index bd3b5872..92782937 100644 --- a/test/cmmlu/chatglm.py +++ b/test/cmmlu/chatglm.py @@ -63,7 +63,7 @@ def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_sh tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True,) model = AutoModel.from_pretrained(args.model_name_or_path, trust_remote_code=True).cpu() - from fastllm_pytools import llm; + from ftllm import llm; model = llm.from_hf(model, tokenizer, dtype = args.dtype); # model.save("/root/test.flm"); diff --git a/test/cmmlu/qwen.py b/test/cmmlu/qwen.py index 44007c7c..fe06d302 100644 --- a/test/cmmlu/qwen.py +++ b/test/cmmlu/qwen.py @@ -77,7 +77,7 @@ def eval_chat_multithread(model, tokenizer, subject, dev_df, test_df, num_few_sh torch_dtype=torch.float16, ) - from fastllm_pytools import llm; + from ftllm import llm; model = llm.from_hf(model, tokenizer, dtype = args.dtype) model.direct_query = True diff --git a/tools/fastllm_pytools/hf_model.py b/tools/fastllm_pytools/hf_model.py index 37bbcfb8..8761dfee 100644 --- a/tools/fastllm_pytools/hf_model.py +++ b/tools/fastllm_pytools/hf_model.py @@ -1,4 +1,4 @@ -from fastllm_pytools import llm; +from ftllm import llm; import ctypes; import builtins, os, json import numpy as np diff --git a/tools/fastllm_pytools/llm.py b/tools/fastllm_pytools/llm.py index 87460132..2fac19cc 100644 --- a/tools/fastllm_pytools/llm.py +++ b/tools/fastllm_pytools/llm.py @@ -127,7 +127,7 @@ def from_hf(model, bot_role = None, history_sep = None, dtype = "float16"): - from fastllm_pytools import hf_model; + from ftllm import hf_model; return hf_model.create(model, tokenizer, pre_prompt = pre_prompt, user_role = user_role, bot_role = bot_role, history_sep = history_sep, dtype = dtype); diff --git 
a/tools/scripts/alpaca2flm.py b/tools/scripts/alpaca2flm.py index a0e12fbd..cd3b0cba 100644 --- a/tools/scripts/alpaca2flm.py +++ b/tools/scripts/alpaca2flm.py @@ -1,7 +1,7 @@ import sys import torch from transformers import AutoTokenizer, LlamaForCausalLM -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": model_name = sys.argv[3] if len(sys.argv) >= 4 else 'minlik/chinese-alpaca-33b-merged' diff --git a/tools/scripts/baichuan2_2flm.py b/tools/scripts/baichuan2_2flm.py index ff3b2fed..39feea60 100644 --- a/tools/scripts/baichuan2_2flm.py +++ b/tools/scripts/baichuan2_2flm.py @@ -2,7 +2,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.utils import GenerationConfig -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelpath = "baichuan-inc/Baichuan2-7B-Chat" diff --git a/tools/scripts/baichuan2flm.py b/tools/scripts/baichuan2flm.py index bcfea7c3..f60620cb 100644 --- a/tools/scripts/baichuan2flm.py +++ b/tools/scripts/baichuan2flm.py @@ -2,7 +2,7 @@ import torch from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation.utils import GenerationConfig -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelpath = "baichuan-inc/baichuan-13B-Chat" diff --git a/tools/scripts/bert2flm.py b/tools/scripts/bert2flm.py index 55b53094..f3ce6d5e 100644 --- a/tools/scripts/bert2flm.py +++ b/tools/scripts/bert2flm.py @@ -1,6 +1,6 @@ import sys from transformers import AutoTokenizer, AutoModel -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelpath = sys.argv[3] if len(sys.argv) >= 4 else 'BAAI/bge-small-zh-v1.5' diff --git a/tools/scripts/chatglm_export.py b/tools/scripts/chatglm_export.py index 8cc4fb69..2be62d37 100644 --- a/tools/scripts/chatglm_export.py +++ b/tools/scripts/chatglm_export.py @@ -1,6 +1,6 @@ import sys from transformers import AutoTokenizer, AutoModel -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True) diff --git a/tools/scripts/cli_demo.py b/tools/scripts/cli_demo.py index 1aa5dcbe..7faec34a 100644 --- a/tools/scripts/cli_demo.py +++ b/tools/scripts/cli_demo.py @@ -1,5 +1,5 @@ import argparse -from fastllm_pytools import llm +from ftllm import llm import readline def args_parser(): diff --git a/tools/scripts/glm_export.py b/tools/scripts/glm_export.py index 284e3c0d..93656287 100644 --- a/tools/scripts/glm_export.py +++ b/tools/scripts/glm_export.py @@ -4,7 +4,7 @@ import torch import binascii from transformers import AutoTokenizer, AutoModel -from fastllm_pytools import torch2flm +from ftllm import torch2flm def glmtofile(exportPath, model, diff --git a/tools/scripts/llama3_to_flm.py b/tools/scripts/llama3_to_flm.py index 1d864c88..241273c3 100644 --- a/tools/scripts/llama3_to_flm.py +++ b/tools/scripts/llama3_to_flm.py @@ -1,7 +1,7 @@ import sys import torch from transformers import AutoTokenizer, AutoModelForCausalLM -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else 'meta-llama/Meta-Llama-3-8B' diff --git a/tools/scripts/llamalike2flm.py b/tools/scripts/llamalike2flm.py index 9249b1e6..7a73a36e 100644 --- a/tools/scripts/llamalike2flm.py +++ b/tools/scripts/llamalike2flm.py @@ -1,7 +1,7 
@@ import sys import torch from transformers import AutoTokenizer, AutoModelForCausalLM -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else 'qwen/Qwen1.5-7B-Chat' diff --git a/tools/scripts/minicpm2flm.py b/tools/scripts/minicpm2flm.py index e4343556..ae08a2ec 100644 --- a/tools/scripts/minicpm2flm.py +++ b/tools/scripts/minicpm2flm.py @@ -1,7 +1,7 @@ import sys import torch from transformers import AutoTokenizer, AutoModelForCausalLM -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": modelNameOrPath = sys.argv[3] if len(sys.argv) >= 4 else "openbmb/MiniCPM-2B-dpo-fp16" diff --git a/tools/scripts/moss_export.py b/tools/scripts/moss_export.py index 9553d238..0a960774 100644 --- a/tools/scripts/moss_export.py +++ b/tools/scripts/moss_export.py @@ -1,6 +1,6 @@ import sys from transformers import AutoTokenizer, AutoModelForCausalLM -from fastllm_pytools import torch2flm +from ftllm import torch2flm tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True); model = AutoModelForCausalLM.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True).float(); diff --git a/tools/scripts/qwen2flm.py b/tools/scripts/qwen2flm.py index 1dde95d3..af54b640 100644 --- a/tools/scripts/qwen2flm.py +++ b/tools/scripts/qwen2flm.py @@ -1,7 +1,7 @@ import sys from transformers import AutoModelForCausalLM, AutoTokenizer from transformers.generation import GenerationConfig -from fastllm_pytools import torch2flm +from ftllm import torch2flm if __name__ == "__main__": tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True) diff --git a/tools/scripts/web_demo.py b/tools/scripts/web_demo.py index 27f93b22..a8f78c73 100644 --- a/tools/scripts/web_demo.py +++ b/tools/scripts/web_demo.py @@ -1,6 +1,6 @@ import streamlit as st from streamlit_chat import message -from fastllm_pytools import llm +from ftllm import llm import sys st.set_page_config(