diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 8132cc6..56eeccb 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -16,13 +16,42 @@ on:
 
 jobs:
   build_wheels:
-    name: ${{ matrix.type }} ${{ matrix.arch }} on ${{ matrix.os }}
+    name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }}
     runs-on: ${{ matrix.os }}
+    env:
+      CIBW_BUILD: ${{ matrix.cibw_build }}
+      CIBW_ARCHS_LINUX: ${{ matrix.arch }}
+      CIBW_ARCHS_MACOS: ${{ matrix.arch }}
     strategy:
-      fail-fast: false
       matrix:
-        os: [macos-latest, ubuntu-latest]
+        os: [ubuntu-latest, windows-latest, macos-latest]
         arch: [auto64]
+        cibw_build: ["cp3{9,10,11}-*"]
+        p_ver: ["3.9-3.11"]
+        exclude:
+          - arch: arm64
+            os: macos-latest
+        include:
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp37*"
+            p_ver: "3.7"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp38*"
+            p_ver: "3.8"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp39*"
+            p_ver: "3.9"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp310*"
+            p_ver: "3.10"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp311*"
+            p_ver: "3.11"
 
     steps:
       - uses: actions/checkout@v3
@@ -30,20 +59,16 @@ jobs:
           fetch-depth: 0
           submodules: true
 
-      # Used to host cibuildwheel
-      - uses: actions/setup-python@v3
-      # Installs poetry
-      - uses: Gr1N/setup-poetry@v8
+      - name: Set up QEMU
+        if: matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64'
+        uses: docker/setup-qemu-action@v1
         with:
-          poetry-version: "1.4.0"
-      - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel==2.12.1
+          platforms: arm64
 
       - name: Build wheels
-        run: python -m cibuildwheel --output-dir wheelhouse
+        uses: pypa/cibuildwheel@v2.12.1
         env:
-          CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
-          CIBW_TEST_SKIP: '*_arm64 *_universal2:arm64'
+          CIBW_ARCHS_MACOS: "x86_64"
 
       - uses: actions/upload-artifact@v3
         with:
diff --git a/README.md b/README.md
index 12b9532..36797ac 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,8 @@ llamacpp-quantize ./models/7B/
 llamacpp-cli
 ```
 
+**Note that running `llamacpp-convert` requires `torch`, `sentencepiece` and `numpy` to be installed. These packages are not installed by default when you install `llamacpp`.**
+
 ## Command line interface
 
 The package installs the command line entry point `llamacpp-cli` that points to `llamacpp/cli.py` and should provide about the same functionality as the `main` program in the original C++ repository. There is also an experimental `llamacpp-chat` that is supposed to bring up a chat interface but this is not working correctly yet.
@@ -40,6 +42,43 @@ The package installs the command line entry point `llamacpp-cli` that points to
 
 See `llamacpp/cli.py` for a detailed example.
 The simplest demo would be something like the following:
+```python
+import llamacpp
+
+params = llamacpp.gpt_params(
+    './models/7B/ggml_model_q4_0.bin',  # model
+    "A llama is a ",  # prompt
+    "",  # reverse_prompt
+    512,  # ctx_size
+    100,  # n_predict
+    40,  # top_k
+    0.95,  # top_p
+    0.85,  # temp
+    1.30,  # repeat_penalty
+    -1,  # seed
+    8,  # threads
+    64,  # repeat_last_n
+    8,  # batch_size
+    False,  # color
+    False,  # interactive or args.interactive_start
+    False,  # interactive_start
+)
+model = llamacpp.PyLLAMA(params)
+model.add_bos()  # Adds "beginning of string" token
+model.update_input(params.prompt)
+model.print_startup_stats()
+model.prepare_context()
+
+input_noecho = False  # set True to suppress echoing of ingested input
+model.ingest_all_pending_input(True)
+while not model.is_finished():
+    model.ingest_all_pending_input(not input_noecho)
+    text, is_finished = model.infer_text()
+    print(text, end="")
+    if is_finished:
+        break
+```
+
 ## ToDo
 
 - [x] Use poetry to build package
diff --git a/build.py b/build.py
index 59969bc..4a3bc00 100644
--- a/build.py
+++ b/build.py
@@ -1,3 +1,4 @@
+import os
 from setuptools_cpp import CMakeExtension, ExtensionBuilder
 from typing import Any, Dict
 
@@ -12,5 +13,13 @@ def build(setup_kwargs: Dict[str, Any]) -> None:
             "ext_modules": ext_modules,
             "cmdclass": dict(build_ext=ExtensionBuilder),
             "zip_safe": False,
+            "options": {
+                'bdist_wheel': {
+                    'plat_name': os.getenv('PP_PYTHON_TARGET', 'any')
+                },
+                'egg_info': {
+                    'egg_base': './build/'
+                }
+            }
         }
     )
diff --git a/llamacpp/convert.py b/llamacpp/convert.py
index e9a5d42..a575f70 100644
--- a/llamacpp/convert.py
+++ b/llamacpp/convert.py
@@ -17,13 +17,21 @@
 # and vocabulary.
 #
 
+# Check if torch is installed; show an error and exit if not
 import sys
 import json
 import struct
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor
+try:
+    import torch
+    import numpy as np
+    from sentencepiece import SentencePieceProcessor
+except ImportError:
+    print("Error: torch, sentencepiece and numpy are required to run this script.")
+    print("Please install using the following command:")
+    print("    pip install torch sentencepiece numpy")
+    sys.exit(1)
+
 
 
 def main():
     if len(sys.argv) < 3:
@@ -35,7 +43,7 @@ def main():
 
     # output in the same directory as the model
    dir_model = sys.argv[1]
-    fname_hparams   = sys.argv[1] + "/params.json"
+    fname_hparams = sys.argv[1] + "/params.json"
     fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
 
     def get_n_parts(dim):
@@ -76,15 +84,15 @@ def get_n_parts(dim):
     n_parts = get_n_parts(hparams["dim"])
 
     print(hparams)
-    print('n_parts = ', n_parts)
+    print("n_parts = ", n_parts)
 
     for p in range(n_parts):
-        print('Processing part ', p)
+        print("Processing part ", p)
 
-        #fname_model = sys.argv[1] + "/consolidated.00.pth"
+        # fname_model = sys.argv[1] + "/consolidated.00.pth"
         fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
         fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-        if (p > 0):
+        if p > 0:
             fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
 
         # weights_only requires torch 1.13.1, remove this param or update if you get an "invalid keyword argument" error
@@ -92,19 +100,19 @@ def get_n_parts(dim):
 
         fout = open(fname_out, "wb")
 
-        fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+        fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
         fout.write(struct.pack("i", hparams["vocab_size"]))
         fout.write(struct.pack("i", hparams["dim"]))
         fout.write(struct.pack("i", hparams["multiple_of"]))
         fout.write(struct.pack("i", hparams["n_heads"]))
         fout.write(struct.pack("i", hparams["n_layers"]))
-        fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
+        fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"]))  # rot (obsolete)
         fout.write(struct.pack("i", ftype))
 
         # Is this correct??
         for i in range(32000):
             # TODO: this is probably wrong - not sure how this tokenizer works
-            text = tokenizer.decode([29889, i]).encode('utf-8')
+            text = tokenizer.decode([29889, i]).encode("utf-8")
             # remove the first byte (it's always '.')
             text = text[1:]
             fout.write(struct.pack("i", len(text)))
@@ -120,16 +128,16 @@ def get_n_parts(dim):
 
             print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
 
-            #data = tf.train.load_variable(dir_model, name).squeeze()
+            # data = tf.train.load_variable(dir_model, name).squeeze()
             data = v.numpy().squeeze()
-            n_dims = len(data.shape);
+            n_dims = len(data.shape)
 
             # for efficiency - transpose some matrices
             # "model/h.*/attn/c_attn/w"
             # "model/h.*/attn/c_proj/w"
             # "model/h.*/mlp/c_fc/w"
             # "model/h.*/mlp/c_proj/w"
-            #if name[-14:] == "/attn/c_attn/w" or \
+            # if name[-14:] == "/attn/c_attn/w" or \
             #    name[-14:] == "/attn/c_proj/w" or \
             #    name[-11:] == "/mlp/c_fc/w" or \
             #    name[-13:] == "/mlp/c_proj/w":
@@ -146,11 +154,11 @@ def get_n_parts(dim):
                 ftype_cur = 0
 
             # header
-            sname = name.encode('utf-8')
+            sname = name.encode("utf-8")
             fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
             for i in range(n_dims):
                 fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-            fout.write(sname);
+            fout.write(sname)
 
             # data
             data.tofile(fout)
@@ -163,5 +171,6 @@ def get_n_parts(dim):
         print("Done. Output file: " + fname_out + ", (part ", p, ")")
         print("")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/pyproject.toml b/pyproject.toml
index 2111a8c..7c13464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llamacpp"
-version = "0.1.5"
+version = "0.1.6"
 description = "Python bindings for @ggerganov's llama.cpp"
 authors = ["Thomas Antony "]
 license = "MIT"
@@ -11,8 +11,12 @@ packages = [{ include = "llamacpp", from = "." }]
 
 [tool.poetry.dependencies]
 python = "^3.6"
+
+[tool.poetry.group.dev.dependencies]
+# Require torch and sentencepiece for running the convert script
 torch = "^1.13.1"
 sentencepiece = "^0.1.97"
+setuptools-cpp = "^0.1.0"
 
 [build-system]
 requires = ["poetry>=0.12", "setuptools", "wheel", "setuptools-cpp"]
@@ -29,9 +33,11 @@ llamacpp-cli = 'llamacpp.cli:run'
 llamacpp-chat = 'llamacpp.chat:run'
 
 [tool.cibuildwheel]
+# Install pybind11 and poetry
+before-build = "pip install -U \"pybind11[global]\" poetry"
 
-# Install something required for the build
-# (you might want to use build-system.requires instead)
-before-build = "pip install -U \"pybind11[global]\""
 # Skip PyPy and 32-bit builds
 skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux_i686"]
+
+build-verbosity = 3
+test-skip = "*macosx*arm64*"
diff --git a/src/PyLlama.cpp b/src/PyLlama.cpp
index 799a3a0..e105d18 100644
--- a/src/PyLlama.cpp
+++ b/src/PyLlama.cpp
@@ -2,17 +2,8 @@
 #include "llama.h"
 #include "utils.h"
 #include 
-#include 
-
-void catch_signals() {
-    auto handler = [](int code) { throw std::runtime_error("SIGNAL " + std::to_string(code)); };
-    signal(SIGINT, handler);
-    signal(SIGTERM, handler);
-    signal(SIGKILL, handler);
-}
-
 namespace py = pybind11;
 
 class PyLLAMA {