Docs #130

Workflow file for this run

.github/workflows/compile-gguf.yml at f2197f7

	# CI smoke test: downloads a GGUF model file and a tokenizer, then runs
	# generate.py in eager mode and with --compile, first without --quant and then
	# with two embedding quantization settings.
	name: Compile main using GGUF

	on:
	  push:
	    branches:
	      - main
	  pull_request:
	  workflow_dispatch:

	jobs:
	  run-tinystories:
	    strategy:
	      matrix:
	        runner: [macos-14]
	    runs-on: ${{ matrix.runner }}
	    # Declared once at job level so the download and inference steps cannot
	    # drift apart on file locations.
	    env:
	      GGUF_PATH: gguf_files/TinyLlama-1.1B-openorca.Q4_0.gguf
	      TOKENIZER_PATH: gguf_files/tokenizer.model
	    steps:
	      - name: Checkout repo
	        uses: actions/checkout@v4
	      - name: Setup Python
	        uses: actions/setup-python@v5
	        with:
	          # Quoted so the version is passed as a string, never as a float.
	          python-version: '3.11'
	      - name: Print machine info
	        run: |
	          uname -a
	          if [ "$(uname -s)" = Darwin ]; then
	            sysctl machdep.cpu.brand_string
	            sysctl machdep.cpu.core_count
	          fi
	      - name: Install requirements
	        run: |
	          pip install gguf
	          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
	          pip install -r requirements.txt
	      - name: Download GGUF
	        run: |
	          # -p keeps the step from failing if the directory already exists.
	          mkdir -p gguf_files

	          wget -O "${GGUF_PATH}" "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true"
	          wget -O "${TOKENIZER_PATH}" "https://github.com/karpathy/llama2.c/raw/master/tokenizer.model"
	      - name: Run inference
	        # NOTE(review): MODEL_NAME and MODEL_DIR are not referenced by any
	        # command in this step; kept in the environment in case generate.py
	        # reads them - TODO confirm, otherwise remove.
	        env:
	          MODEL_NAME: TinyLlama-1.1B-openorca.Q4_0.gguf
	          MODEL_DIR: /tmp
	        run: |
	          # Each eager/compiled pair overwrites ./output_eager and
	          # ./output_compiled; the files are printed but not compared.
	          echo "******************************************"
	          echo "***** Embed: not quantized ***********"
	          echo "******************************************"

	          echo "Running eager"
	          python generate.py \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_eager
	          cat ./output_eager

	          echo "Running compiled"
	          python generate.py --compile \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_compiled
	          cat ./output_compiled

	          echo "******************************************"
	          echo "***** Emb: channel-wise quantized ****"
	          echo "******************************************"

	          echo "Running eager"
	          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_eager
	          cat ./output_eager

	          echo "Running compiled"
	          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 0}}' \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_compiled
	          cat ./output_compiled

	          echo "******************************************"
	          echo "****** Emb: group-wise quantized *****"
	          echo "******************************************"

	          echo "Running eager"
	          python generate.py --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_eager
	          cat ./output_eager

	          echo "Running compiled"
	          python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "groupsize": 8}}' \
	            --gguf-path "${GGUF_PATH}" --tokenizer-path "${TOKENIZER_PATH}" \
	            --max-new-tokens 20 --temperature 0 > ./output_compiled
	          cat ./output_compiled

	          echo "tests complete"
	          echo "******************************************"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Docs #130

Workflow file

Docs #130

Jobs

Run details

Workflow file for this run