Merge pull request #1 from thomasantony/make-torch-optional
Make torch an optional dependency and fix the CI workflows
thomasantony authored Mar 19, 2023
2 parents b03974b + dcc54f2 commit 8851626
Showing 6 changed files with 120 additions and 43 deletions.
51 changes: 38 additions & 13 deletions .github/workflows/wheels.yml
@@ -16,34 +16,59 @@ on:

jobs:
build_wheels:
name: ${{ matrix.type }} ${{ matrix.arch }} on ${{ matrix.os }}
name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }}
runs-on: ${{ matrix.os }}
env:
CIBW_BUILD: ${{ matrix.cibw_build }}
CIBW_ARCHS_LINUX: ${{ matrix.arch }}
CIBW_ARCHS_MACOS: ${{ matrix.arch }}
strategy:
fail-fast: false
matrix:
os: [macos-latest, ubuntu-latest]
os: [ubuntu-latest, windows-latest, macos-latest]
arch: [auto64]
cibw_build: ["cp3{9,10,11}-*"]
p_ver: ["3.9-3.11"]
exclude:
- arch: arm64
os: macos-latest
include:
- arch: aarch64
os: ubuntu-latest
cibw_build: "cp37*"
p_ver: "3.7"
- arch: aarch64
os: ubuntu-latest
cibw_build: "cp38*"
p_ver: "3.8"
- arch: aarch64
os: ubuntu-latest
cibw_build: "cp39*"
p_ver: "3.9"
- arch: aarch64
os: ubuntu-latest
cibw_build: "cp310*"
p_ver: "3.10"
- arch: aarch64
os: ubuntu-latest
cibw_build: "cp311*"
p_ver: "3.11"

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
submodules: true

# Used to host cibuildwheel
- uses: actions/setup-python@v3
# Installs poetry
- uses: Gr1N/setup-poetry@v8
- name: Set up QEMU
if: matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64'
uses: docker/setup-qemu-action@v1
with:
poetry-version: "1.4.0"
- name: Install cibuildwheel
run: python -m pip install cibuildwheel==2.12.1
platforms: arm64

- name: Build wheels
run: python -m cibuildwheel --output-dir wheelhouse
uses: pypa/cibuildwheel@v2.12.1
env:
CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
CIBW_TEST_SKIP: '*_arm64 *_universal2:arm64'
CIBW_ARCHS_MACOS: "x86_64"

- uses: actions/upload-artifact@v3
with:
37 changes: 37 additions & 0 deletions README.md
@@ -32,6 +32,8 @@ llamacpp-quantize ./models/7B/
llamacpp-cli
```

**Note that running `llamacpp-convert` requires `torch`, `sentencepiece`, and `numpy` to be installed. These packages are not installed by default when you install `llamacpp`.**
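
If you do need the conversion script, one way to pull in the extra packages (this mirrors the hint that `llamacpp-convert` prints when they are missing) is:

```
pip install torch sentencepiece numpy
```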

## Command line interface

The package installs the command-line entry point `llamacpp-cli`, which points to `llamacpp/cli.py` and should provide roughly the same functionality as the `main` program in the original C++ repository. There is also an experimental `llamacpp-chat` that is supposed to bring up a chat interface, but this is not working correctly yet.
@@ -40,6 +42,41 @@ The package installs the command line entry point `llamacpp-cli` that points to

See `llamacpp/cli.py` for a detailed example. The simplest demo would be something like the following:

```python
import llamacpp

input_noecho = False  # set to True to suppress echoing of ingested input

params = llamacpp.gpt_params(
'./models/7B/ggml_model_q4_0.bin', # model,
"A llama is a ", # prompt
"", # reverse_prompt
512, # ctx_size
100, # n_predict
40, # top_k
0.95, # top_p
0.85, # temp
1.30, # repeat_penalty
-1, # seed
8, # threads
64, # repeat_last_n
8, # batch_size
False, # color
False, # interactive or args.interactive_start
False, # interactive_start
)
model = llamacpp.PyLLAMA(params)
model.add_bos() # Adds "beginning of string" token
model.update_input(params.prompt)
model.print_startup_stats()
model.prepare_context()

model.ingest_all_pending_input(True)
while not model.is_finished():
model.ingest_all_pending_input(not input_noecho)
text, is_finished = model.infer_text()
print(text, end="")
if is_finished:
break
```

## ToDo

- [x] Use poetry to build package
9 changes: 9 additions & 0 deletions build.py
@@ -1,3 +1,4 @@
import os
from setuptools_cpp import CMakeExtension, ExtensionBuilder
from typing import Any, Dict

@@ -12,5 +13,13 @@ def build(setup_kwargs: Dict[str, Any]) -> None:
"ext_modules": ext_modules,
"cmdclass": dict(build_ext=ExtensionBuilder),
"zip_safe": False,
"options": {
'bdist_wheel': {
'plat_name': os.getenv('PP_PYTHON_TARGET', 'any')
},
'egg_info': {
'egg_base': './build/'
}
}
}
)
43 changes: 26 additions & 17 deletions llamacpp/convert.py
@@ -17,13 +17,21 @@
# and vocabulary.
#

# Check if torch is installed; show an error and exit if not
import sys
import json
import struct
import numpy as np
import torch

from sentencepiece import SentencePieceProcessor
try:
import torch
import numpy as np
from sentencepiece import SentencePieceProcessor
except ImportError:
print("Error: torch, sentencepiece and numpy are required to run this script.")
print("Please install using the following command:")
print(" pip install torch sentencepiece numpy")
sys.exit(1)


def main():
if len(sys.argv) < 3:
@@ -35,7 +43,7 @@ def main():
# output in the same directory as the model
dir_model = sys.argv[1]

fname_hparams = sys.argv[1] + "/params.json"
fname_hparams = sys.argv[1] + "/params.json"
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"

def get_n_parts(dim):
@@ -76,35 +84,35 @@ def get_n_parts(dim):
n_parts = get_n_parts(hparams["dim"])

print(hparams)
print('n_parts = ', n_parts)
print("n_parts = ", n_parts)

for p in range(n_parts):
print('Processing part ', p)
print("Processing part ", p)

#fname_model = sys.argv[1] + "/consolidated.00.pth"
# fname_model = sys.argv[1] + "/consolidated.00.pth"
fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
if (p > 0):
if p > 0:
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)

# weights_only requires torch 1.13.1, remove this param or update if you get an "invalid keyword argument" error
model = torch.load(fname_model, map_location="cpu", weights_only=True)

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", 0x67676D6C)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["dim"]))
fout.write(struct.pack("i", hparams["multiple_of"]))
fout.write(struct.pack("i", hparams["n_heads"]))
fout.write(struct.pack("i", hparams["n_layers"]))
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
fout.write(struct.pack("i", ftype))

# Is this correct??
for i in range(32000):
# TODO: this is probably wrong - not sure how this tokenizer works
text = tokenizer.decode([29889, i]).encode('utf-8')
text = tokenizer.decode([29889, i]).encode("utf-8")
# remove the first byte (it's always '.')
text = text[1:]
fout.write(struct.pack("i", len(text)))
@@ -120,16 +128,16 @@ def get_n_parts(dim):

print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)

#data = tf.train.load_variable(dir_model, name).squeeze()
# data = tf.train.load_variable(dir_model, name).squeeze()
data = v.numpy().squeeze()
n_dims = len(data.shape);
n_dims = len(data.shape)

# for efficiency - transpose some matrices
# "model/h.*/attn/c_attn/w"
# "model/h.*/attn/c_proj/w"
# "model/h.*/mlp/c_fc/w"
# "model/h.*/mlp/c_proj/w"
#if name[-14:] == "/attn/c_attn/w" or \
# if name[-14:] == "/attn/c_attn/w" or \
# name[-14:] == "/attn/c_proj/w" or \
# name[-11:] == "/mlp/c_fc/w" or \
# name[-13:] == "/mlp/c_proj/w":
@@ -146,11 +154,11 @@ def get_n_parts(dim):
ftype_cur = 0

# header
sname = name.encode('utf-8')
sname = name.encode("utf-8")
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for i in range(n_dims):
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
fout.write(sname);
fout.write(sname)

# data
data.tofile(fout)
@@ -163,5 +171,6 @@ def get_n_parts(dim):
print("Done. Output file: " + fname_out + ", (part ", p, ")")
print("")

if __name__ == '__main__':

if __name__ == "__main__":
main()
14 changes: 10 additions & 4 deletions pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "llamacpp"
version = "0.1.5"
version = "0.1.6"
description = "Python bindings for @ggerganov's llama.cpp"
authors = ["Thomas Antony <[email protected]>"]
license = "MIT"
@@ -11,8 +11,12 @@ packages = [{ include = "llamacpp", from = "." }]

[tool.poetry.dependencies]
python = "^3.6"

[tool.poetry.group.dev.dependencies]
# Require torch and sentencepiece for running the convert script
torch = "^1.13.1"
sentencepiece = "^0.1.97"
setuptools-cpp = "^0.1.0"

[build-system]
requires = ["poetry>=0.12", "setuptools", "wheel", "setuptools-cpp"]
@@ -29,9 +33,11 @@ llamacpp-cli = 'llamacpp.cli:run'
llamacpp-chat = 'llamacpp.chat:run'

[tool.cibuildwheel]
# Install pybind and poetry
before-build = "pip install -U \"pybind11[global]\" poetry"

# Install something required for the build
# (you might want to use build-system.requires instead)
before-build = "pip install -U \"pybind11[global]\""
# Skip PyPy and 32-bit builds
skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux_i686"]

build-verbosity = 3
test-skip = "*macosx*arm64*"
9 changes: 0 additions & 9 deletions src/PyLlama.cpp
@@ -2,17 +2,8 @@
#include "llama.h"
#include "utils.h"
#include <pybind11/pybind11.h>
#include <csignal>



void catch_signals() {
auto handler = [](int code) { throw std::runtime_error("SIGNAL " + std::to_string(code)); };
signal(SIGINT, handler);
signal(SIGTERM, handler);
signal(SIGKILL, handler);
}

namespace py = pybind11;

class PyLLAMA {
