[Inference] Fix auth token and add models starcoder and llama2 (intel#39)

* add starcoder and enable llama2

* nit

* nit

* revert

* add token

* dedup

* add token to from_pretrained

* pass auth token to from_pretrained

* nit

* add auth tokens

* lint

* fix lint

* nit

* deepspeed not support starcoder

* nit

* remove from ci

* remove direct auth token

* add back ci workflow temporarily

* remove from ci

* add load environment and enable 2 models again

* add dir

* add load environment and enable 2 models again

* change proxy

* revert proxy

* change proxy

* revert proxy

* remove 2 models from ci

---------

Signed-off-by: Yizhong Zhang <[email protected]>
Deegue authored Feb 7, 2024
1 parent 8baef62 commit 6d72097
Showing 6 changed files with 63 additions and 7 deletions.
29 changes: 26 additions & 3 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference test
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, llama-2-7b-chat-hf-vllm ]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm ]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -61,11 +61,15 @@ jobs:
https_proxy: ${{ inputs.https_proxy }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ${{ inputs.runner_config_path }}:/root/actions-runner-config

steps:
- name: Checkout
uses: actions/checkout@v2

- name: Load environment variables
run: cat /root/actions-runner-config/.env >> $GITHUB_ENV

- name: Determine Target
id: "target"
run: |
@@ -111,6 +115,25 @@ jobs:
- name: Run Inference Test
run: |
TARGET=${{steps.target.outputs.target}}
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "starcoder"):
conf_path = "inference/models/starcoder.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "inference/models/llama-2-7b-chat-hf.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
EOF
)
docker exec "${TARGET}" python -c "$CMD"
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
@@ -134,7 +157,7 @@ jobs:
- name: Run Inference Test with DeepSpeed
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
@@ -147,7 +170,7 @@ jobs:
if: ${{ matrix.dtuner_model }}
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
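
Note: the new "Run Inference Test" step above injects the Hugging Face token at CI time. It builds a small Python heredoc that opens the model's YAML, writes the token from the runner's environment into model_description.config.use_auth_token, and dumps the file back before the test runs inside the container. Below is a minimal standalone sketch of that same logic, assuming the YAML layout used by inference/models/*.yaml; the function name and the direct invocation are illustrative, not part of the repository.

import os
import yaml

def inject_auth_token(conf_path: str, token: str) -> None:
    # Read the model config, set the HF token, and write the file back in place.
    with open(conf_path, encoding="utf-8") as reader:
        result = yaml.load(reader, Loader=yaml.FullLoader)
    result["model_description"]["config"]["use_auth_token"] = token
    with open(conf_path, "w") as output:
        yaml.dump(result, output, sort_keys=False)

# HF_ACCESS_TOKEN is placed in the job environment by the "Load environment variables"
# step, which reads the runner's .env file.
inject_auth_token("inference/models/starcoder.yaml", os.environ["HF_ACCESS_TOKEN"])
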
7 changes: 6 additions & 1 deletion inference/deepspeed_predictor.py
@@ -35,6 +35,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

# get correct torch type for loading HF model
@@ -50,7 +51,11 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
if model_desc.peft_model_id_or_path:
from peft import PeftModel

self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path)
self.model = PeftModel.from_pretrained(
self.model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel

2 changes: 1 addition & 1 deletion inference/models/llama-2-7b-chat-hf.yaml
@@ -7,7 +7,7 @@ deepspeed: false
workers_per_group: 2
device: "cpu"
ipex:
enabled: true
enabled: false
precision: bf16
model_description:
model_id_or_path: meta-llama/Llama-2-7b-chat-hf
22 changes: 22 additions & 0 deletions inference/models/starcoder.yaml
@@ -0,0 +1,22 @@
port: 8000
name: starcoder
route_prefix: /starcoder
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
ipex:
enabled: false
precision: bf16
device: "cpu"
model_description:
model_id_or_path: bigcode/starcoder
tokenizer_name_or_path: bigcode/starcoder
chat_processor: ChatModelGptJ
prompt:
intro: ''
human_id: ''
bot_id: ''
stop_words: []
config:
use_auth_token: ''
3 changes: 2 additions & 1 deletion inference/predictor.py
@@ -10,7 +10,8 @@ class Predictor:
def __init__(self, infer_conf: InferenceConfig) -> None:
self.infer_conf = infer_conf
self.tokenizer = AutoTokenizer.from_pretrained(
infer_conf.model_description.tokenizer_name_or_path
infer_conf.model_description.tokenizer_name_or_path,
**infer_conf.model_description.config.dict(),
)
self.device = torch.device(infer_conf.device)
# now deepspeed predictor don't have the model
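
With this change the tokenizer no longer receives only the model path: every field of model_description.config is expanded into keyword arguments, so use_auth_token (set by the YAML or the CI step) reaches AutoTokenizer.from_pretrained. A hedged sketch of that expansion, using illustrative values rather than the repository's actual config:

from transformers import AutoTokenizer

# Illustrative values only; the real dict comes from model_description.config in the YAML.
config = {"trust_remote_code": False, "use_auth_token": "hf_xxx"}
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", **config)
# Equivalent to:
# AutoTokenizer.from_pretrained("bigcode/starcoder",
#                               trust_remote_code=False, use_auth_token="hf_xxx")
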
7 changes: 6 additions & 1 deletion inference/transformer_predictor.py
@@ -15,6 +15,7 @@ def __init__(self, infer_conf: InferenceConfig):
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

if self.device.type == "hpu":
@@ -52,7 +53,11 @@ def __init__(self, infer_conf: InferenceConfig):
if model_desc.peft_model_id_or_path:
from peft import PeftModel

model = PeftModel.from_pretrained(model, model_desc.peft_model_id_or_path)
model = PeftModel.from_pretrained(
model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel

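
Taken together, the predictor changes route the token from the YAML's model_description.config into every from_pretrained call: the AutoConfig, the tokenizer, the base model, and the optional PEFT adapter. A minimal sketch of that flow outside the repository's Predictor classes, assuming the config layout of inference/models/starcoder.yaml; the variable names are hypothetical.

import yaml
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

with open("inference/models/starcoder.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

desc = conf["model_description"]
token = desc["config"]["use_auth_token"] or None  # empty string in the YAML means "no token"

model_config = AutoConfig.from_pretrained(
    desc["model_id_or_path"], torchscript=True, use_auth_token=token
)
tokenizer = AutoTokenizer.from_pretrained(desc["tokenizer_name_or_path"], use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(
    desc["model_id_or_path"], config=model_config, use_auth_token=token
)
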
