diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index 8132cc6..56eeccb 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -16,13 +16,42 @@ on:
 
 jobs:
   build_wheels:
-    name: ${{ matrix.type }} ${{ matrix.arch }} on ${{ matrix.os }}
+    name: Build wheels on ${{ matrix.os }} for ${{ matrix.arch }} - ${{ matrix.p_ver }}
     runs-on: ${{ matrix.os }}
+    env:
+      CIBW_BUILD: ${{ matrix.cibw_build }}
+      CIBW_ARCHS_LINUX: ${{ matrix.arch }}
+      CIBW_ARCHS_MACOS: ${{ matrix.arch }}
     strategy:
-      fail-fast: false
       matrix:
-        os: [macos-latest, ubuntu-latest]
+        os: [ubuntu-latest, windows-latest, macos-latest]
         arch: [auto64]
+        cibw_build: ["cp3{9,10,11}-*"]
+        p_ver: ["3.9-3.11"]
+        exclude:
+          - arch: arm64
+            os: macos-latest
+        include:
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp37*"
+            p_ver: "3.7"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp38*"
+            p_ver: "3.8"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp39*"
+            p_ver: "3.9"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp310*"
+            p_ver: "3.10"
+          - arch: aarch64
+            os: ubuntu-latest
+            cibw_build: "cp311*"
+            p_ver: "3.11"
 
     steps:
       - uses: actions/checkout@v3
@@ -30,20 +59,16 @@ jobs:
           fetch-depth: 0
           submodules: true
 
-      # Used to host cibuildwheel
-      - uses: actions/setup-python@v3
-      # Installs poetry
-      - uses: Gr1N/setup-poetry@v8
+      - name: Set up QEMU
+        if: matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64'
+        uses: docker/setup-qemu-action@v1
         with:
-          poetry-version: "1.4.0"
-      - name: Install cibuildwheel
-        run: python -m pip install cibuildwheel==2.12.1
+          platforms: arm64
 
       - name: Build wheels
-        run: python -m cibuildwheel --output-dir wheelhouse
+        uses: pypa/cibuildwheel@v2.12.1
         env:
-          CIBW_ARCHS_MACOS: "x86_64 universal2 arm64"
-          CIBW_TEST_SKIP: '*_arm64 *_universal2:arm64'
+          CIBW_ARCHS_MACOS: "x86_64"
 
       - uses: actions/upload-artifact@v3
         with:
diff --git a/README.md b/README.md
index 12b9532..36797ac 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,8 @@ llamacpp-quantize ./models/7B/
 llamacpp-cli
 ```
 
+**Note that running `llamacpp-convert` requires `torch`, `sentencepiece` and `numpy` to be installed. These packages are not installed by default when you install `llamacpp`.**
+
 ## Command line interface
 
 The package installs the command line entry point `llamacpp-cli` that points to `llamacpp/cli.py` and should provide about the same functionality as the `main` program in the original C++ repository. There is also an experimental `llamacpp-chat` that is supposed to bring up a chat interface but this is not working correctly yet.
@@ -40,6 +42,43 @@ The package installs the command line entry point `llamacpp-cli` that points to
 
 See `llamacpp/cli.py` for a detailed example.
 The simplest demo would be something like the following:
+```python
+import llamacpp
+
+params = llamacpp.gpt_params(
+    './models/7B/ggml_model_q4_0.bin',  # model
+    "A llama is a ",  # prompt
+    "",  # reverse_prompt
+    512,  # ctx_size
+    100,  # n_predict
+    40,  # top_k
+    0.95,  # top_p
+    0.85,  # temp
+    1.30,  # repeat_penalty
+    -1,  # seed
+    8,  # threads
+    64,  # repeat_last_n
+    8,  # batch_size
+    False,  # color
+    False,  # interactive or args.interactive_start
+    False,  # interactive_start
+)
+model = llamacpp.PyLLAMA(params)
+model.add_bos()  # Adds "beginning of string" token
+model.update_input(params.prompt)
+model.print_startup_stats()
+model.prepare_context()
+
+input_noecho = False  # set True to suppress echoing of ingested input
+model.ingest_all_pending_input(True)
+while not model.is_finished():
+    model.ingest_all_pending_input(not input_noecho)
+    text, is_finished = model.infer_text()
+    print(text, end="")
+    if is_finished:
+        break
+```
+
 ## ToDo
 
 - [x] Use poetry to build package
diff --git a/build.py b/build.py
index 59969bc..4a3bc00 100644
--- a/build.py
+++ b/build.py
@@ -1,3 +1,4 @@
+import os
 from setuptools_cpp import CMakeExtension, ExtensionBuilder
 from typing import Any, Dict
 
@@ -12,5 +13,13 @@ def build(setup_kwargs: Dict[str, Any]) -> None:
             "ext_modules": ext_modules,
             "cmdclass": dict(build_ext=ExtensionBuilder),
             "zip_safe": False,
+            "options": {
+                'bdist_wheel': {
+                    'plat_name': os.getenv('PP_PYTHON_TARGET', 'any')
+                },
+                'egg_info': {
+                    'egg_base': './build/'
+                }
+            }
         }
     )
diff --git a/llamacpp/convert.py b/llamacpp/convert.py
index e9a5d42..a575f70 100644
--- a/llamacpp/convert.py
+++ b/llamacpp/convert.py
@@ -17,13 +17,21 @@
 # and vocabulary.
 #
 
+# Check if torch is installed; show an error and exit if not
 import sys
 import json
 import struct
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor
+try:
+    import torch
+    import numpy as np
+    from sentencepiece import SentencePieceProcessor
+except ImportError:
+    print("Error: torch, sentencepiece and numpy are required to run this script.")
+    print("Please install using the following command:")
+    print("    pip install torch sentencepiece numpy")
+    sys.exit(1)
+
 
 
 def main():
     if len(sys.argv) < 3:
@@ -35,7 +43,7 @@ def main():
 
     # output in the same directory as the model
    dir_model = sys.argv[1]
-    fname_hparams   = sys.argv[1] + "/params.json"
+    fname_hparams = sys.argv[1] + "/params.json"
     fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
 
     def get_n_parts(dim):
@@ -76,15 +84,15 @@ def get_n_parts(dim):
     n_parts = get_n_parts(hparams["dim"])
 
     print(hparams)
-    print('n_parts = ', n_parts)
+    print("n_parts = ", n_parts)
 
     for p in range(n_parts):
-        print('Processing part ', p)
+        print("Processing part ", p)
 
-        #fname_model = sys.argv[1] + "/consolidated.00.pth"
+        # fname_model = sys.argv[1] + "/consolidated.00.pth"
         fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
         fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-        if (p > 0):
+        if p > 0:
             fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
 
         # weights_only requires torch 1.13.1, remove this param or update if you get an "invalid keyword argument" error
@@ -92,19 +100,19 @@ def get_n_parts(dim):
 
         fout = open(fname_out, "wb")
 
-        fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+        fout.write(struct.pack("i", 0x67676D6C))  # magic: ggml in hex
         fout.write(struct.pack("i", hparams["vocab_size"]))
         fout.write(struct.pack("i", hparams["dim"]))
         fout.write(struct.pack("i", hparams["multiple_of"]))
         fout.write(struct.pack("i", hparams["n_heads"]))
         fout.write(struct.pack("i", hparams["n_layers"]))
-        fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
+        fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"]))  # rot (obsolete)
         fout.write(struct.pack("i", ftype))
 
         # Is this correct??
         for i in range(32000):
             # TODO: this is probably wrong - not sure how this tokenizer works
-            text = tokenizer.decode([29889, i]).encode('utf-8')
+            text = tokenizer.decode([29889, i]).encode("utf-8")
             # remove the first byte (it's always '.')
             text = text[1:]
             fout.write(struct.pack("i", len(text)))
@@ -120,16 +128,16 @@ def get_n_parts(dim):
 
             print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
 
-            #data = tf.train.load_variable(dir_model, name).squeeze()
+            # data = tf.train.load_variable(dir_model, name).squeeze()
             data = v.numpy().squeeze()
-            n_dims = len(data.shape);
+            n_dims = len(data.shape)
 
             # for efficiency - transpose some matrices
             # "model/h.*/attn/c_attn/w"
             # "model/h.*/attn/c_proj/w"
             # "model/h.*/mlp/c_fc/w"
             # "model/h.*/mlp/c_proj/w"
-            #if name[-14:] == "/attn/c_attn/w" or \
+            # if name[-14:] == "/attn/c_attn/w" or \
             #    name[-14:] == "/attn/c_proj/w" or \
             #    name[-11:] == "/mlp/c_fc/w" or \
             #    name[-13:] == "/mlp/c_proj/w":
@@ -146,11 +154,11 @@ def get_n_parts(dim):
                 ftype_cur = 0
 
             # header
-            sname = name.encode('utf-8')
+            sname = name.encode("utf-8")
             fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
             for i in range(n_dims):
                 fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
-            fout.write(sname);
+            fout.write(sname)
 
             # data
             data.tofile(fout)
@@ -163,5 +171,6 @@ def get_n_parts(dim):
         print("Done. Output file: " + fname_out + ", (part ", p, ")")
         print("")
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
diff --git a/pyproject.toml b/pyproject.toml
index 2111a8c..7c13464 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llamacpp"
-version = "0.1.5"
+version = "0.1.6"
 description = "Python bindings for @ggerganov's llama.cpp"
 authors = ["Thomas Antony "]
 license = "MIT"
@@ -11,8 +11,12 @@ packages = [{ include = "llamacpp", from = "." }]
 
 [tool.poetry.dependencies]
 python = "^3.6"
+
+[tool.poetry.group.dev.dependencies]
+# Require torch and sentencepiece for running the convert script
 torch = "^1.13.1"
 sentencepiece = "^0.1.97"
+setuptools-cpp = "^0.1.0"
 
 [build-system]
 requires = ["poetry>=0.12", "setuptools", "wheel", "setuptools-cpp"]
@@ -29,9 +33,11 @@ llamacpp-cli = 'llamacpp.cli:run'
 llamacpp-chat = 'llamacpp.chat:run'
 
 [tool.cibuildwheel]
+# Install pybind11 and poetry
+before-build = "pip install -U \"pybind11[global]\" poetry"
 
-# Install something required for the build
-# (you might want to use build-system.requires instead)
-before-build = "pip install -U \"pybind11[global]\""
 # Skip PyPy and 32-bit builds
 skip = ["pp*", "*-win32", "*-manylinux_i686", "*-musllinux_i686"]
+
+build-verbosity = 3
+test-skip = "*macosx*arm64*"
diff --git a/src/PyLlama.cpp b/src/PyLlama.cpp
index 799a3a0..e105d18 100644
--- a/src/PyLlama.cpp
+++ b/src/PyLlama.cpp
@@ -2,17 +2,8 @@
 #include "llama.h"
 #include "utils.h"
 #include 
-#include 
-
-void catch_signals() {
-    auto handler = [](int code) { throw std::runtime_error("SIGNAL " + std::to_string(code)); };
-    signal(SIGINT, handler);
-    signal(SIGTERM, handler);
-    signal(SIGKILL, handler);
-}
-
 namespace py = pybind11;
 
 class PyLLAMA {