diff --git a/src/models/captured_graph_pool.cpp b/src/models/captured_graph_pool.cpp
index 99bd158d9..6a9bfc48e 100644
--- a/src/models/captured_graph_pool.cpp
+++ b/src/models/captured_graph_pool.cpp
@@ -20,7 +20,7 @@ void CapturedGraphInfoRecycler::operator()(CapturedGraphInfo* captured_graph_inf
 }
 
 CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model, int max_batch_size) const {
-  if (model.use_cuda_graph_ && (model.device_type_ == DeviceType::CUDA || model.device_type_ == DeviceType::DML)) {
+  if (!model.use_cuda_graph_ || (model.device_type_ != DeviceType::CUDA && model.device_type_ != DeviceType::DML)) {
     return nullptr;
   }
 
diff --git a/src/models/captured_graph_pool.h b/src/models/captured_graph_pool.h
index 0e00351fa..aabafef0f 100644
--- a/src/models/captured_graph_pool.h
+++ b/src/models/captured_graph_pool.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include
 #include
 #include
diff --git a/test/python/test_onnxruntime_genai_api.py b/test/python/test_onnxruntime_genai_api.py
index 66d217dae..3132e5c84 100644
--- a/test/python/test_onnxruntime_genai_api.py
+++ b/test/python/test_onnxruntime_genai_api.py
@@ -10,112 +10,6 @@
 import onnxruntime_genai as og
 import pytest
 
-
-@pytest.mark.parametrize(
-    "relative_model_path",
-    (
-        [
-            Path("hf-internal-testing") / "tiny-random-gpt2-fp32",
-            Path("hf-internal-testing") / "tiny-random-gpt2-fp32-cuda",
-            Path("hf-internal-testing") / "tiny-random-gpt2-fp16-cuda",
-        ]
-        if og.is_cuda_available()
-        else [Path("hf-internal-testing") / "tiny-random-gpt2-fp32"]
-    ),
-)
-def test_greedy_search(test_data_path, relative_model_path):
-    model_path = os.fspath(Path(test_data_path) / relative_model_path)
-
-    model = og.Model(model_path)
-
-    search_params = og.GeneratorParams(model)
-    search_params.input_ids = np.array(
-        [[0, 0, 0, 52], [0, 0, 195, 731]], dtype=np.int32
-    )
-    search_params.set_search_options(do_sample=False, max_length=10)
-    input_ids_shape = [2, 4]
-    batch_size = input_ids_shape[0]
-
-    generator = og.Generator(model, search_params)
-    while not generator.is_done():
-        generator.compute_logits()
-        generator.generate_next_token()
-
-    expected_sequence = np.array(
-        [
-            [0, 0, 0, 52, 204, 204, 204, 204, 204, 204],
-            [0, 0, 195, 731, 731, 114, 114, 114, 114, 114],
-        ],
-        dtype=np.int32,
-    )
-    for i in range(batch_size):
-        assert np.array_equal(expected_sequence[i], generator.get_sequence(i))
-
-    sequences = model.generate(search_params)
-    for i in range(len(sequences)):
-        assert sequences[i] == expected_sequence[i].tolist()
-
-
-# TODO: CUDA pipelines use python3.6 and do not have a way to download models since downloading models
-# requires pytorch and hf transformers. This test should be re-enabled once the pipeline is updated.
-@pytest.mark.skipif(
-    sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
-    reason="Python 3.8 is required for downloading models.",
-)
-@pytest.mark.parametrize(
-    "device", ["cpu", "cuda"] if og.is_cuda_available() else ["cpu"]
-)
-@pytest.mark.parametrize("batch", [True, False])
-def test_tokenizer_encode_decode(device, phi2_for, batch):
-    model_path = phi2_for(device)
-
-    model = og.Model(model_path)
-    tokenizer = og.Tokenizer(model)
-
-    prompts = [
-        "This is a test.",
-        "Rats are awesome pets!",
-        "The quick brown fox jumps over the lazy dog.",
-    ]
-    sequences = None
-    if batch:
-        sequences = tokenizer.encode_batch(prompts)
-        decoded_strings = tokenizer.decode_batch(sequences)
-        assert prompts == decoded_strings
-    else:
-        for prompt in prompts:
-            sequence = tokenizer.encode(prompt)
-            decoded_string = tokenizer.decode(sequence)
-            assert prompt == decoded_string
-
-
-@pytest.mark.skipif(
-    sysconfig.get_platform().endswith("arm64") or sys.version_info.minor < 8,
-    reason="Python 3.8 is required for downloading models.",
-)
-@pytest.mark.parametrize(
-    "device", ["cpu", "cuda"] if og.is_cuda_available() else ["cpu"]
-)
-def test_tokenizer_stream(device, phi2_for):
-    model = og.Model(phi2_for(device))
-    tokenizer = og.Tokenizer(model)
-    tokenizer_stream = tokenizer.create_stream()
-
-    prompts = [
-        "This is a test.",
-        "Rats are awesome pets!",
-        "The quick brown fox jumps over the lazy dog.",
-    ]
-
-    for prompt in prompts:
-        sequence = tokenizer.encode(prompt)
-        decoded_string = ""
-        for token in sequence:
-            decoded_string += tokenizer_stream.decode(token)
-
-        assert decoded_string == prompt
-
-
 # TODO: CUDA pipelines use python3.6 and do not have a way to download models since downloading models
 # requires pytorch and hf transformers. This test should be re-enabled once the pipeline is updated.
 @pytest.mark.skipif(
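
Note on the captured_graph_pool.cpp change: the new early-return guard is the De Morgan negation of the old condition, so ReserveCapturedGraph now bails out exactly when CUDA graphs are disabled or the device is neither CUDA nor DML, instead of the reverse. Below is a minimal standalone sketch of that equivalence, not part of the patch; is_cuda and is_dml are hypothetical stand-ins for the model.device_type_ comparisons.

#include <cassert>

// Illustrative only: check that the post-fix guard is the exact logical
// negation of the pre-fix guard for all eight input combinations.
int main() {
  for (int g = 0; g < 2; ++g)
    for (int c = 0; c < 2; ++c)
      for (int d = 0; d < 2; ++d) {
        bool use_cuda_graph = g, is_cuda = c, is_dml = d;           // hypothetical stand-ins
        bool old_guard = use_cuda_graph && (is_cuda || is_dml);     // pre-fix condition
        bool new_guard = !use_cuda_graph || (!is_cuda && !is_dml);  // post-fix condition
        assert(new_guard == !old_guard);
      }
  return 0;
}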