From 9344b1efcdfaa95945dee919eb1243490886a81e Mon Sep 17 00:00:00 2001 From: "n.mckillip" Date: Mon, 4 Mar 2024 20:11:02 +0000 Subject: [PATCH] supporting more device types, and unpacking model loading bug. --- runtimes/huggingface/README.md | 18 ----- .../mlserver_huggingface/codecs/base.py | 17 ---- runtimes/huggingface/tests/test_codecs.py | 80 +------------------ runtimes/huggingface/tests/test_common.py | 46 ----------- 4 files changed, 3 insertions(+), 158 deletions(-) diff --git a/runtimes/huggingface/README.md b/runtimes/huggingface/README.md index ba15a39ca..04db06ccb 100644 --- a/runtimes/huggingface/README.md +++ b/runtimes/huggingface/README.md @@ -66,24 +66,6 @@ Models in the HuggingFace hub can be loaded by specifying their name in `paramet If `parameters.extra.pretrained_model` is specified, it takes precedence over `parameters.uri`. ```` -#### Model Inference -Model inference is done by HuggingFace pipeline. It allows users to run inference on a batch of inputs. Extra inference kwargs can be kept in `parameters.extra`. -```{code-block} json -{ - "inputs": [ - { - "name": "text_inputs", - "shape": [1], - "datatype": "BYTES", - "data": ["My kitten's name is JoJo,","Tell me a story:"], - } - ], - "parameters": { - "extra":{"max_new_tokens": 200,"return_full_text": false} - } -} -``` - ### Reference You can find the full reference of the accepted extra settings for the diff --git a/runtimes/huggingface/mlserver_huggingface/codecs/base.py b/runtimes/huggingface/mlserver_huggingface/codecs/base.py index 3ea4a5be6..4fdfcbb08 100644 --- a/runtimes/huggingface/mlserver_huggingface/codecs/base.py +++ b/runtimes/huggingface/mlserver_huggingface/codecs/base.py @@ -1,4 +1,3 @@ -import logging from typing import Optional, Type, Any, Dict, List, Union, Sequence from mlserver.codecs.utils import ( has_decoded, @@ -171,10 +170,6 @@ def encode_request(cls, payload: Dict[str, Any], **kwargs) -> InferenceRequest: @classmethod def decode_request(cls, request: InferenceRequest) -> Dict[str, Any]: - """ - Decode Inference request into dictionary - extra Inference kwargs are extracted from 'InferenceRequest.parameters.extra' - """ values = {} field_codecs = cls._find_decode_codecs(request) for item in request.inputs: @@ -186,18 +181,6 @@ def decode_request(cls, request: InferenceRequest) -> Dict[str, Any]: value = get_decoded_or_raw(item) values[item.name] = value - - if request.parameters is not None: - if hasattr(request.parameters, "extra"): - extra = request.parameters.extra - if isinstance(extra, dict): - values.update(extra) - else: - logging.warn( - "Extra parameters is provided with " - + f"value '{extra}' and type '{type(extra)}' \n" - + "Extra parameters cannot be parsed, expected a dictionary." - ) return values diff --git a/runtimes/huggingface/tests/test_codecs.py b/runtimes/huggingface/tests/test_codecs.py index 4c3395ca8..0aead7663 100644 --- a/runtimes/huggingface/tests/test_codecs.py +++ b/runtimes/huggingface/tests/test_codecs.py @@ -1,5 +1,5 @@ import pytest -import logging + from mlserver.types import ( InferenceRequest, InferenceResponse, @@ -28,87 +28,13 @@ ] ), {"foo": ["bar1", "bar2"], "foo2": ["var1"]}, - ), - ( - InferenceRequest( - parameters=Parameters(content_type="str", extra={"foo3": "var2"}), - inputs=[ - RequestInput( - name="foo", - datatype="BYTES", - data=["bar1", "bar2"], - shape=[2, 1], - ), - RequestInput( - name="foo2", datatype="BYTES", data=["var1"], shape=[1, 1] - ), - ], - ), - {"foo": ["bar1", "bar2"], "foo2": ["var1"], "foo3": "var2"}, - ), + ) ], ) def test_decode_request(inference_request, expected): payload = HuggingfaceRequestCodec.decode_request(inference_request) - assert payload == expected - -@pytest.mark.parametrize( - "inference_request, expected_payload, expected_log_msg", - [ - ( - InferenceRequest( - parameters=Parameters(content_type="str", extra="foo3"), - inputs=[ - RequestInput( - name="foo", - datatype="BYTES", - data=["bar1", "bar2"], - shape=[2, 1], - ), - RequestInput( - name="foo2", datatype="BYTES", data=["var1"], shape=[1, 1] - ), - ], - ), - {"foo": ["bar1", "bar2"]}, - logging.warn( - "Extra parameters is provided with ", - +"value: 'foo3' and type ' \n", - +"Extra parameters cannot be parsed, expected a dictionary.", - ), - ), - ( - InferenceRequest( - parameters=Parameters(content_type="str", extra=1234), - inputs=[ - RequestInput( - name="foo", - datatype="BYTES", - data=["bar1", "bar2"], - shape=[2, 1], - ), - RequestInput( - name="foo2", datatype="BYTES", data=["var1"], shape=[1, 1] - ), - ], - ), - {"foo": ["bar1", "bar2"]}, - logging.warn( - "Extra parameters is provided with " - + "value '1234' and type ' \n", - +"Extra parameters cannot be parsed, expected a dictionary.", - ), - ), - ], -) -def test_decode_request_with_invalid_parameter_extra( - inference_request, expected_payload, expected_log_msg, caplog -): - caplog.set_level(logging.WARN) - payload = HuggingfaceRequestCodec.decode_request(inference_request) - assert payload == expected_payload - assert expected_log_msg in caplog.text + assert payload == expected @pytest.mark.parametrize( diff --git a/runtimes/huggingface/tests/test_common.py b/runtimes/huggingface/tests/test_common.py index a96ea8dd4..acad03026 100644 --- a/runtimes/huggingface/tests/test_common.py +++ b/runtimes/huggingface/tests/test_common.py @@ -250,49 +250,3 @@ def test_pipeline_checks_for_eos_and_pad_token( m = load_pipeline_from_settings(hf_settings, model_settings) assert m._batch_size == expected_batch_size - - -@pytest.mark.parametrize( - "inference_kwargs, expected_num_tokens", - [ - ({"max_new_tokens": 10, "return_full_text": False}, 10), - ({"max_new_tokens": 20, "return_full_text": False}, 20), - ], -) -async def test_pipeline_uses_inference_kwargs( - inference_kwargs: Optional[dict], - expected_num_tokens: int, -): - model_settings = ModelSettings( - name="foo", - implementation=HuggingFaceRuntime, - parameters=ModelParameters( - extra={ - "pretrained_model": "Maykeye/TinyLLama-v0", - "task": "text-generation", - } - ), - ) - runtime = HuggingFaceRuntime(model_settings) - runtime.ready = await runtime.load() - payload = InferenceRequest( - inputs=[ - RequestInput( - name="args", - shape=[1], - datatype="BYTES", - data=["This is a test"], - ) - ], - parameters=Parameters(extra=inference_kwargs), - ) - tokenizer = runtime._model.tokenizer - - prediction = await runtime.predict(payload) - decoded_prediction = MultiInputRequestCodec.decode_response(prediction) - if isinstance(decoded_prediction, dict): - generated_text = decoded_prediction["output"][0]["generated_text"] - assert isinstance(generated_text, str) - tokenized_generated_text = tokenizer.tokenize(generated_text) - num_predicted_tokens = len(tokenized_generated_text) - assert num_predicted_tokens == expected_num_tokens