[Inference] Fix auth token and add models starcoder and llama2 (intel#39)

* add starcoder and enable llama2

* nit

* nit

* revert

* add token

* dedup

* add token to from_pretrained

* pass auth token to from_pretrained

* nit

* add auth tokens

* lint

* fix lint

* nit

* deepspeed not support starcoder

* nit

* remove from ci

* remove direct auth token

* add back ci workflow temporarily

* remove from ci

* add load environment and enable 2 models again

* add dir

* add load environment and enable 2 models again

* change proxy

* revert proxy

* change proxy

* revert proxy

* remove 2 models from ci

---------

Signed-off-by: Yizhong Zhang <[email protected]>
Deegue authored Feb 7, 2024
1 parent 8baef62 commit 6d72097
Showing 6 changed files with 63 additions and 7 deletions.
29 changes: 26 additions & 3 deletions .github/workflows/workflow_inference.yml
@@ -34,7 +34,7 @@ jobs:
name: inference test
strategy:
matrix:
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, llama-2-7b-chat-hf-vllm ]
model: [ gpt-j-6b, gpt2, bloom-560m, opt-125m, mpt-7b, mistral-7b-v0.1, mpt-7b-bigdl, neural-chat-7b-v3-1, CodeLlama-7b-hf, falcon-7b, starcoder, llama-2-7b-chat-hf, llama-2-7b-chat-hf-vllm ]
isPR:
- ${{inputs.ci_type == 'pr'}}

@@ -61,11 +61,15 @@ jobs:
https_proxy: ${{ inputs.https_proxy }}
volumes:
- /var/run/docker.sock:/var/run/docker.sock
- ${{ inputs.runner_config_path }}:/root/actions-runner-config

steps:
- name: Checkout
uses: actions/checkout@v2

- name: Load environment variables
run: cat /root/actions-runner-config/.env >> $GITHUB_ENV

- name: Determine Target
id: "target"
run: |
@@ -111,6 +115,25 @@ jobs:
- name: Run Inference Test
run: |
TARGET=${{steps.target.outputs.target}}
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "starcoder"):
conf_path = "inference/models/starcoder.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "inference/models/llama-2-7b-chat-hf.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
EOF
)
docker exec "${TARGET}" python -c "$CMD"
if [[ ${{ matrix.model }} == "mpt-7b-bigdl" ]]; then
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file inference/models/bigdl/mpt-7b-bigdl.yaml --simple"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
@@ -134,7 +157,7 @@ jobs:
- name: Run Inference Test with DeepSpeed
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
elif [[ ! ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "python .github/workflows/config/update_inference_config.py --config_file inference/models/\"${{ matrix.model }}\".yaml --output_file \"${{ matrix.model }}\".yaml.deepspeed --deepspeed"
@@ -147,7 +170,7 @@ jobs:
if: ${{ matrix.dtuner_model }}
run: |
TARGET=${{steps.target.outputs.target}}
if [[ ${{ matrix.model }} =~ ^(gpt2|mpt-7b.*)$ ]]; then
if [[ ${{ matrix.model }} =~ ^(gpt2|falcon-7b|starcoder|mpt-7b.*)$ ]]; then
echo ${{ matrix.model }} is not supported!
else
docker exec "${TARGET}" bash -c "python inference/serve.py --config_file .github/workflows/config/mpt_deltatuner_deepspeed.yaml --simple"
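
Note: the new "Run Inference Test" step above injects the Hugging Face token at CI time. It builds a small Python heredoc that opens the model's YAML, writes the token from the runner's environment into model_description.config.use_auth_token, and dumps the file back before the test runs inside the container. Below is a minimal standalone sketch of that same logic, assuming the YAML layout used by inference/models/*.yaml; the function name and the direct invocation are illustrative, not part of the repository.

import os
import yaml

def inject_auth_token(conf_path: str, token: str) -> None:
    # Read the model config, set the HF token, and write the file back in place.
    with open(conf_path, encoding="utf-8") as reader:
        result = yaml.load(reader, Loader=yaml.FullLoader)
    result["model_description"]["config"]["use_auth_token"] = token
    with open(conf_path, "w") as output:
        yaml.dump(result, output, sort_keys=False)

# HF_ACCESS_TOKEN is placed in the job environment by the "Load environment variables"
# step, which reads the runner's .env file.
inject_auth_token("inference/models/starcoder.yaml", os.environ["HF_ACCESS_TOKEN"])
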
7 changes: 6 additions & 1 deletion inference/deepspeed_predictor.py
@@ -35,6 +35,7 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

# get correct torch type for loading HF model
@@ -50,7 +51,11 @@ def __init__(self, infer_conf: InferenceConfig, pad_token_id, stopping_criteria)
if model_desc.peft_model_id_or_path:
from peft import PeftModel

self.model = PeftModel.from_pretrained(self.model, model_desc.peft_model_id_or_path)
self.model = PeftModel.from_pretrained(
self.model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel

2 changes: 1 addition & 1 deletion inference/models/llama-2-7b-chat-hf.yaml
@@ -7,7 +7,7 @@ deepspeed: false
workers_per_group: 2
device: "cpu"
ipex:
enabled: true
enabled: false
precision: bf16
model_description:
model_id_or_path: meta-llama/Llama-2-7b-chat-hf
22 changes: 22 additions & 0 deletions inference/models/starcoder.yaml
@@ -0,0 +1,22 @@
port: 8000
name: starcoder
route_prefix: /starcoder
cpus_per_worker: 24
gpus_per_worker: 0
deepspeed: false
workers_per_group: 2
ipex:
enabled: false
precision: bf16
device: "cpu"
model_description:
model_id_or_path: bigcode/starcoder
tokenizer_name_or_path: bigcode/starcoder
chat_processor: ChatModelGptJ
prompt:
intro: ''
human_id: ''
bot_id: ''
stop_words: []
config:
use_auth_token: ''
3 changes: 2 additions & 1 deletion inference/predictor.py
@@ -10,7 +10,8 @@ class Predictor:
def __init__(self, infer_conf: InferenceConfig) -> None:
self.infer_conf = infer_conf
self.tokenizer = AutoTokenizer.from_pretrained(
infer_conf.model_description.tokenizer_name_or_path
infer_conf.model_description.tokenizer_name_or_path,
**infer_conf.model_description.config.dict(),
)
self.device = torch.device(infer_conf.device)
# now deepspeed predictor don't have the model
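
With this change the tokenizer no longer receives only the model path: every field of model_description.config is expanded into keyword arguments, so use_auth_token (set by the YAML or the CI step) reaches AutoTokenizer.from_pretrained. A hedged sketch of that expansion, using illustrative values rather than the repository's actual config:

from transformers import AutoTokenizer

# Illustrative values only; the real dict comes from model_description.config in the YAML.
config = {"trust_remote_code": False, "use_auth_token": "hf_xxx"}
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoder", **config)
# Equivalent to:
# AutoTokenizer.from_pretrained("bigcode/starcoder",
#                               trust_remote_code=False, use_auth_token="hf_xxx")
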
7 changes: 6 additions & 1 deletion inference/transformer_predictor.py
@@ -15,6 +15,7 @@ def __init__(self, infer_conf: InferenceConfig):
model_desc.model_id_or_path,
torchscript=True,
trust_remote_code=model_config.trust_remote_code,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)

if self.device.type == "hpu":
@@ -52,7 +53,11 @@ def __init__(self, infer_conf: InferenceConfig):
if model_desc.peft_model_id_or_path:
from peft import PeftModel

model = PeftModel.from_pretrained(model, model_desc.peft_model_id_or_path)
model = PeftModel.from_pretrained(
model,
model_desc.peft_model_id_or_path,
use_auth_token=infer_conf.model_description.config.use_auth_token,
)
if model_desc.peft_type == "deltatuner":
from deltatuner import DeltaTunerModel

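
Taken together, the predictor changes route the token from the YAML's model_description.config into every from_pretrained call: the AutoConfig, the tokenizer, the base model, and the optional PEFT adapter. A minimal sketch of that flow outside the repository's Predictor classes, assuming the config layout of inference/models/starcoder.yaml; the variable names are hypothetical.

import yaml
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

with open("inference/models/starcoder.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

desc = conf["model_description"]
token = desc["config"]["use_auth_token"] or None  # empty string in the YAML means "no token"

model_config = AutoConfig.from_pretrained(
    desc["model_id_or_path"], torchscript=True, use_auth_token=token
)
tokenizer = AutoTokenizer.from_pretrained(desc["tokenizer_name_or_path"], use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(
    desc["model_id_or_path"], config=model_config, use_auth_token=token
)
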
