From e13e345a28ad86e22f3440487e11c9965a877a2a Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Fri, 18 Oct 2024 13:06:22 -0700
Subject: [PATCH] Add support for a CPU-only Mode (#1851)

* Adds a new enum `morpheus.config.ExecutionMode` with members `GPU` & `CPU`, along with a new `morpheus.config.Config.execution_mode` attribute.
* For backwards compatibility, `Config.execution_mode` defaults to `GPU`.
* Add a new `supported_execution_modes` method to `StageBase`, which returns `ExecutionMode.GPU` by default. This ensures that building a pipeline with a stage that does not match the execution mode raises a reasonable error to the user.
* Add `CpuOnlyMixin` and `GpuAndCpuMixin` mixins to automate overriding this, making it easier for users to determine at a glance which execution modes a given stage supports.
* Since C++ stage/message impls can only support cuDF DataFrames and RMM tensors, this PR re-purposes the existing Python stage/message impls to serve as CPU-only mode.
* CPU-only mode centers around pandas DataFrames and NumPy arrays for tensors, since the current Python code, which expects cuDF/CuPy, is already 99% compatible with pandas/NumPy.
* Avoid importing `cudf`, or any other GPU-based package that will fail on import, at the top level of a module. This is important for stages, messages, and modules that are automatically imported by the Morpheus CLI tool.
* Add new utility methods to `morpheus.utils.type_utils` (e.g. `get_df_pkg`, `is_cudf_type`) to help avoid importing `cudf` directly.
* Add a new `Config.freeze` method which makes a config object immutable. It is called the first time a config object is used to construct a pipeline or stage object, preventing config parameters from being changed in the middle of pipeline construction.
* `CudfHelper::load` is no longer called automatically on import; instead, it is called on pipeline build when the execution mode is GPU.
* Add a Python implementation of `ControlMessage`.
* To simulate a system without a GPU when testing CPU-only mode, `docker/run_container_dev.sh` launches the container using the `runc` runtime when the `CPU_ONLY` environment variable is defined.
* Remove automatic test parameterization of C++/Python mode, since supporting CPU-only mode will become the exception, not the rule. Add a new `gpu_and_cpu_mode` test marker to explicitly indicate a test intended to be parameterized over execution modes.
* Fix the copy constructor for `ControlMessage`.
* `AppShieldSourceStage` now emits `ControlMessage`s; `AppShieldMessageMeta` is now deprecated.
* `AutoencoderSourceStage`, and thus `AzureSourceStage`, `CloudTrailSourceStage`, and `DuoSourceStage`, now emit `ControlMessage`; `UserMessageMeta` is now deprecated.
* The DFP production pipeline has been updated to remove `DFPMessageMeta`; the pipeline now executes in C++ mode.
* Consolidate common logic in `docker/run_container_dev.sh` & `docker/run_container_release.sh` into `docker/run_container.sh`.
* Remove inconsistent behavior in the Python impl of `TensorMemory.set_tensor` (#1955)

Closes #1646
Closes #1846
Closes #1852
Closes #1955

## By Submitting this PR I confirm:
- I am familiar with the [Contributing Guidelines](https://github.com/nv-morpheus/Morpheus/blob/main/docs/source/developer_guide/contributing.md).
- When the PR is ready for review, new or existing tests cover these changes.
- When the PR is ready for review, the documentation is up to date with these changes.
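For reference, a minimal sketch of the execution-mode API described above, adapted from the `examples/cpu_only/run.py` example added in this PR (the input filename here is illustrative, not part of the change):

```python
from morpheus.config import Config
from morpheus.config import ExecutionMode
from morpheus.messages import MessageMeta
from morpheus.pipeline.linear_pipeline import LinearPipeline
from morpheus.pipeline.stage_decorator import stage
from morpheus.stages.input.file_source_stage import FileSourceStage
from morpheus.utils.type_utils import get_df_pkg

config = Config()
config.execution_mode = ExecutionMode.CPU  # defaults to ExecutionMode.GPU

# Returns the pandas package in CPU mode and cudf in GPU mode,
# avoiding a top-level cudf import that would fail on GPU-less systems
df_pkg = get_df_pkg(config.execution_mode)


# Stages declare the execution modes they support; the @stage decorator
# accepts them via the new execution_modes argument
@stage(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU))
def pass_thru(msg: MessageMeta) -> MessageMeta:
    return msg


pipeline = LinearPipeline(config)
pipeline.set_source(FileSourceStage(config, filename="input.jsonlines"))
pipeline.add_stage(pass_thru(config))
pipeline.build()  # freezes the config and validates each stage's supported modes
pipeline.run()
```

Building the pipeline with a stage whose `supported_execution_modes` does not include `config.execution_mode` raises an error at build time rather than failing mid-run.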
Authors: - David Gardner (https://github.com/dagardner-nv) - Yuchen Zhang (https://github.com/yczhang-nv) Approvers: - Michael Demoret (https://github.com/mdemoret-nv) URL: https://github.com/nv-morpheus/Morpheus/pull/1851 --- .../config/vocabularies/morpheus/accept.txt | 5 + docker/run_container.sh | 57 +++++ docker/run_container_dev.sh | 38 +-- docker/run_container_release.sh | 39 +-- docs/source/conf.py | 1 + .../guides/2_real_world_phishing.md | 5 +- .../6_digital_fingerprinting_reference.md | 10 +- .../abp_pcap_preprocessing.py | 8 +- examples/cpu_only/run.py | 140 +++++++++++ .../1_simple_python_stage/pass_thru.py | 3 +- .../1_simple_python_stage/pass_thru_deco.py | 3 +- .../2_1_real_world_phishing/run.py | 2 +- .../developer_guide/2_2_rabbitmq/README.md | 1 + .../2_2_rabbitmq/rabbitmq_source_stage.py | 11 +- .../rabbitmq_source_stage_deco.py | 10 +- .../2_2_rabbitmq/read_simple.py | 13 +- .../2_2_rabbitmq/write_simple.py | 21 +- .../2_2_rabbitmq/write_to_rabbitmq_stage.py | 3 +- .../src/simple_cpp_stage/pass_thru.py | 3 +- .../rabbitmq_source_stage.py | 11 +- .../write_to_rabbitmq_stage.py | 3 +- .../4_rabbitmq_cpp_stage/src/read_simple.py | 9 +- .../4_rabbitmq_cpp_stage/src/write_simple.py | 21 +- .../production/dfp_azure_pipeline.py | 4 - .../production/dfp_duo_pipeline.py | 4 - .../dfp_integrated_training_batch_pipeline.py | 9 +- ..._integrated_training_streaming_pipeline.py | 9 +- .../production/grafana/run.py | 4 - .../production/morpheus/benchmarks/README.md | 10 +- .../benchmarks/benchmark_conf_generator.py | 1 - .../benchmarks/resource/pipelines_conf.json | 60 ++--- .../notebooks/dfp_azure_inference.ipynb | 4 - .../dfp_azure_integrated_training.ipynb | 3 +- .../notebooks/dfp_azure_training.ipynb | 4 - .../notebooks/dfp_duo_inference.ipynb | 4 - .../dfp_duo_integrated_training.ipynb | 1 - .../morpheus/notebooks/dfp_duo_training.ipynb | 4 - .../digital_fingerprinting/starter/README.md | 121 +++++----- .../starter/run_cloudtrail_dfp.py | 9 +- .../visualization/dfp_viz_azure_pipeline.py | 4 - .../visualization/dfp_viz_duo_pipeline.py | 4 - examples/doca/run_tcp.py | 3 - examples/doca/run_udp_convert.py | 3 - examples/doca/run_udp_raw.py | 3 - examples/doca/vdb_realtime/vdb.py | 3 - examples/gnn_fraud_detection_pipeline/run.py | 3 - examples/llm/cli.py | 10 +- examples/llm/vdb_upload/run.py | 2 +- examples/log_parsing/inference.py | 3 +- examples/ransomware_detection/README.md | 2 +- examples/ransomware_detection/run.py | 6 +- .../stages/create_features.py | 68 +++--- .../stages/preprocessing.py | 35 ++- examples/sid_visualization/README.md | 4 +- examples/sid_visualization/run.py | 8 +- external/utilities | 2 +- models/model-cards/dfp-model-card.md | 2 +- morpheus.code-workspace | 12 + pyproject.toml | 5 +- .../morpheus/morpheus/_lib/common/module.cpp | 4 - .../include/morpheus/messages/control.hpp | 137 +---------- .../include/morpheus/utilities/cudf_util.hpp | 5 +- .../morpheus/_lib/messages/__init__.pyi | 2 +- .../morpheus/_lib/messages/module.cpp | 10 +- .../morpheus/_lib/src/messages/control.cpp | 115 +++++++-- .../morpheus/_lib/src/messages/meta.cpp | 3 +- .../morpheus/_lib/src/utilities/cudf_util.cpp | 10 +- .../morpheus/morpheus/_lib/stages/module.cpp | 4 - .../_lib/tests/messages/test_dev_doc_ex3.cpp | 22 +- .../_lib/tests/messages/test_messages.hpp | 17 +- .../stages/test_triton_inference_stage.cpp | 17 +- .../morpheus/_lib/tests/test_file_in_out.cpp | 16 +- .../morpheus/_lib/tests/test_utils/common.cpp | 4 - python/morpheus/morpheus/cli/commands.py | 38 ++- 
python/morpheus/morpheus/config.py | 41 ++++ .../controllers/file_to_df_controller.py | 6 +- .../filter_detections_controller.py | 13 +- .../mlflow_model_writer_controller.py | 11 +- .../controllers/monitor_controller.py | 18 +- .../morpheus/controllers/rss_controller.py | 15 +- python/morpheus/morpheus/io/deserializers.py | 47 ++-- python/morpheus/morpheus/io/serializers.py | 12 +- python/morpheus/morpheus/io/utils.py | 98 +++++++- python/morpheus/morpheus/messages/__init__.py | 4 +- .../morpheus/messages/control_message.py | 203 ++++++++++++++++ .../messages/memory/inference_memory.py | 55 +++-- .../messages/memory/response_memory.py | 27 ++- .../morpheus/messages/memory/tensor_memory.py | 38 ++- .../morpheus/messages/message_meta.py | 61 ++--- .../morpheus/modules/filter_detections.py | 2 +- .../morpheus/modules/payload_batcher.py | 23 +- .../morpheus/morpheus/parsers/event_parser.py | 26 +- python/morpheus/morpheus/parsers/ip.py | 227 ++++++++++-------- .../morpheus/morpheus/parsers/url_parser.py | 70 ++++-- .../morpheus/parsers/windows_event_parser.py | 21 +- python/morpheus/morpheus/parsers/zeek.py | 15 +- .../pipeline/execution_mode_mixins.py | 69 ++++++ .../morpheus/pipeline/linear_pipeline.py | 1 + python/morpheus/morpheus/pipeline/pipeline.py | 3 + .../morpheus/pipeline/preallocator_mixin.py | 27 ++- .../morpheus/pipeline/single_port_stage.py | 1 - .../morpheus/morpheus/pipeline/stage_base.py | 23 ++ .../morpheus/pipeline/stage_decorator.py | 87 +++++-- .../stages/boundary/linear_boundary_stage.py | 5 +- .../morpheus/stages/general/monitor_stage.py | 3 +- .../stages/general/multi_processing_stage.py | 7 + .../morpheus/stages/general/trigger_stage.py | 3 +- .../inference/auto_encoder_inference_stage.py | 5 +- .../stages/inference/inference_stage.py | 37 +-- .../inference/triton_inference_stage.py | 10 +- .../stages/input/appshield_source_stage.py | 81 ++++--- .../morpheus/stages/input/arxiv_source.py | 15 +- .../stages/input/autoencoder_source_stage.py | 78 +++--- .../databricks_deltalake_source_stage.py | 19 +- .../stages/input/file_source_stage.py | 12 +- .../stages/input/http_client_source_stage.py | 26 +- .../stages/input/http_server_source_stage.py | 29 ++- .../input/in_memory_data_generation_stage.py | 5 +- .../stages/input/in_memory_source_stage.py | 7 +- .../stages/input/kafka_source_stage.py | 15 +- .../morpheus/stages/input/rss_source_stage.py | 8 +- .../stages/output/compare_dataframe_stage.py | 7 +- .../stages/output/http_client_sink_stage.py | 3 +- .../stages/output/http_server_sink_stage.py | 14 +- .../stages/output/in_memory_sink_stage.py | 3 +- .../write_to_databricks_deltalake_stage.py | 5 +- .../output/write_to_elasticsearch_stage.py | 5 +- .../stages/output/write_to_file_stage.py | 5 +- .../stages/output/write_to_kafka_stage.py | 3 +- .../postprocess/add_classifications_stage.py | 2 +- .../postprocess/add_scores_stage_base.py | 3 +- .../postprocess/filter_detections_stage.py | 5 +- .../postprocess/generate_viz_frames_stage.py | 12 +- .../stages/postprocess/serialize_stage.py | 5 +- .../stages/postprocess/timeseries_stage.py | 4 +- .../stages/postprocess/validation_stage.py | 2 +- .../stages/preprocess/deserialize_stage.py | 23 +- .../stages/preprocess/drop_null_stage.py | 21 +- .../preprocess/group_by_column_stage.py | 3 +- .../stages/preprocess/preprocess_ae_stage.py | 10 +- .../preprocess/preprocess_base_stage.py | 19 +- .../stages/preprocess/preprocess_fil_stage.py | 67 +----- .../stages/preprocess/preprocess_nlp_stage.py | 99 +------- 
.../stages/preprocess/train_ae_stage.py | 67 +++--- python/morpheus/morpheus/utils/column_info.py | 29 +-- python/morpheus/morpheus/utils/concat_df.py | 7 +- .../morpheus/morpheus/utils/module_utils.py | 14 +- .../morpheus/utils/schema_transforms.py | 23 +- python/morpheus/morpheus/utils/seed.py | 23 +- .../morpheus/morpheus/utils/type_aliases.py | 15 +- python/morpheus/morpheus/utils/type_utils.py | 134 +++++++++++ .../morpheus_dfp/messages/__init__.py | 13 - .../morpheus_dfp/messages/dfp_message_meta.py | 42 ---- .../morpheus_dfp/modules/dfp_inference.py | 10 +- .../morpheus_dfp/modules/dfp_training.py | 7 +- .../stages/dfp_rolling_window_stage.py | 24 +- .../stages/dfp_split_users_stage.py | 32 +-- .../morpheus_dfp/utils/config_generator.py | 5 - .../morpheus_llm/_lib/llm/module.cpp | 4 - .../modules/output/write_to_vector_db.py | 16 +- .../service/vdb/faiss_vdb_service.py | 22 +- .../service/vdb/milvus_vector_db_service.py | 9 +- .../service/vdb/vector_db_service.py | 19 +- .../stages/llm/llm_engine_stage.py | 24 +- tests/_utils/dataset_manager.py | 46 ++-- tests/_utils/inference_worker.py | 2 - tests/_utils/stages/check_pre_alloc.py | 11 +- .../stages/control_message_pass_thru.py | 9 +- tests/_utils/stages/conv_msg.py | 35 +-- tests/_utils/stages/dfp_length_checker.py | 3 +- tests/_utils/stages/error_raiser.py | 3 +- .../stages/in_memory_multi_source_stage.py | 3 +- .../_utils/stages/in_memory_source_x_stage.py | 3 +- tests/_utils/stages/multi_port_pass_thru.py | 3 +- tests/_utils/stages/record_thread_id_stage.py | 3 +- tests/_utils/stages/split_stage.py | 3 +- .../test_bench_agents_simple_pipeline.py | 2 +- .../test_bench_completion_pipeline.py | 2 +- .../test_bench_rag_standalone_pipeline.py | 2 +- .../test_bench_vdb_upload_pipeline.py | 2 +- tests/conftest.py | 218 +++++++---------- .../developer_guide/test_pass_thru.py | 40 +-- .../gnn_fraud_detection_pipeline/conftest.py | 2 +- .../test_classification_stage.py | 2 +- .../test_graph_construction_stage.py | 2 +- .../test_graph_sage_stage.py | 2 +- .../common/test_content_extractor_module.py | 2 - .../llm/common/test_web_scraper_module.py | 2 - .../llm/common/test_web_scraper_stage.py | 2 - .../test_schema_transform_module.py | 2 - tests/examples/log_parsing/conftest.py | 2 +- tests/examples/log_parsing/test_inference.py | 24 +- .../log_parsing/test_postprocessing.py | 4 +- .../examples/ransomware_detection/conftest.py | 2 +- .../test_create_features.py | 101 +++----- .../test_preprocessing.py | 29 ++- tests/morpheus/apps/test_abp.py | 154 +----------- tests/morpheus/apps/test_abp_kafka.py | 98 +------- tests/morpheus/apps/test_phishing.py | 73 +----- tests/morpheus/apps/test_phishing_kafka.py | 101 +------- tests/morpheus/apps/test_sid.py | 2 +- tests/morpheus/apps/test_sid_kafka.py | 100 +------- .../test_elasticsearch_controller.py | 6 - tests/morpheus/dfencoder/test_autoencoder.py | 2 +- tests/morpheus/dfencoder/test_pkg.py | 26 -- tests/morpheus/io/test_io_utils.py | 16 ++ .../morpheus/messages/test_control_message.py | 133 +++++----- tests/morpheus/messages/test_message_meta.py | 16 +- tests/morpheus/messages/test_messages.py | 18 +- tests/morpheus/messages/test_tensor_memory.py | 123 +++++----- .../modules/test_from_control_message.py | 4 +- .../morpheus/modules/test_payload_batcher.py | 4 +- .../modules/test_to_control_message.py | 2 +- .../parsers/test_windows_event_parser.py | 1 + tests/morpheus/pipeline/test_error_pipe.py | 7 +- .../morpheus/pipeline/test_execution_modes.py | 148 ++++++++++++ 
tests/morpheus/pipeline/test_file_in_out.py | 9 +- tests/morpheus/pipeline/test_pipe_viz.py | 9 +- tests/morpheus/pipeline/test_pipeline.py | 2 +- .../pipeline/test_preallocation_pipe.py | 10 +- .../morpheus/pipeline/test_stage_decorator.py | 34 +-- .../stages/test_add_classifications_stage.py | 22 +- .../morpheus/stages/test_add_scores_stage.py | 21 +- .../stages/test_appshield_source_stage.py | 30 ++- .../stages/test_deserialize_stage_pipe.py | 2 +- .../morpheus/stages/test_file_source_stage.py | 31 +++ .../stages/test_file_source_stage_pipe.py | 4 +- .../stages/test_filter_detections_stage.py | 65 +++-- .../test_filter_detections_stage_pipe.py | 10 +- .../stages/test_generate_viz_frames_stage.py | 4 +- .../stages/test_http_server_sink_stage.py | 2 +- .../stages/test_http_server_source_stage.py | 7 +- tests/morpheus/stages/test_inference_stage.py | 25 +- .../stages/test_kafka_source_stage_pipe.py | 4 + .../stages/test_linear_modules_stage.py | 26 +- .../stages/test_ml_flow_drift_stage.py | 7 +- tests/morpheus/stages/test_monitor_stage.py | 2 +- .../stages/test_multi_port_modules_stage.py | 2 +- .../stages/test_multi_processing_stage.py | 35 ++- .../stages/test_preprocess_fil_stage.py | 21 +- .../stages/test_preprocess_nlp_stage.py | 36 +-- .../stages/test_rss_source_stage_pipe.py | 14 +- tests/morpheus/stages/test_serialize_stage.py | 13 +- .../morpheus/stages/test_timeseries_stage.py | 7 +- .../stages/test_triton_inference_stage.py | 25 +- .../test_write_to_elasticsearch_stage_pipe.py | 17 +- .../stages/test_write_to_file_stage.py | 48 ---- .../stages/test_write_to_kafka_stage_pipe.py | 15 +- tests/morpheus/test_cli.py | 7 +- tests/morpheus/test_config.py | 45 ++++ tests/morpheus/utils/test_column_info.py | 17 -- .../morpheus/utils/test_directory_watcher.py | 1 - tests/morpheus/utils/test_inference_worker.py | 6 +- tests/morpheus/utils/test_module_utils.py | 1 - tests/morpheus/utils/test_type_utils.py | 107 +++++++++ tests/morpheus_dfp/conftest.py | 26 +- .../morpheus_dfp/modules/test_dfp_training.py | 9 +- .../stages/test_dfp_mlflow_model_writer.py | 7 +- .../stages/test_dfp_rolling_window_stage.py | 80 +++--- .../stages/test_dfp_split_users_stage.py | 26 +- tests/morpheus_dfp/test_dfp.py | 16 +- tests/morpheus_dfp/test_dfp_kafka.py | 9 +- .../morpheus_llm/llm/test_vdb_upload_pipe.py | 1 - .../stages/test_llm_engine_stage_pipe.py | 4 - ...st_milvus_write_to_vector_db_stage_pipe.py | 2 +- tests/test_conftest.py | 155 ++++++------ 266 files changed, 3257 insertions(+), 3145 deletions(-) create mode 100755 docker/run_container.sh create mode 100644 examples/cpu_only/run.py create mode 100644 python/morpheus/morpheus/messages/control_message.py create mode 100644 python/morpheus/morpheus/pipeline/execution_mode_mixins.py delete mode 100644 python/morpheus_dfp/morpheus_dfp/messages/__init__.py delete mode 100644 python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py delete mode 100755 tests/morpheus/dfencoder/test_pkg.py create mode 100755 tests/morpheus/pipeline/test_execution_modes.py create mode 100755 tests/morpheus/stages/test_file_source_stage.py delete mode 100755 tests/morpheus/stages/test_write_to_file_stage.py create mode 100644 tests/morpheus/utils/test_type_utils.py diff --git a/ci/vale/styles/config/vocabularies/morpheus/accept.txt b/ci/vale/styles/config/vocabularies/morpheus/accept.txt index 157edebd18..285a85c7d8 100644 --- a/ci/vale/styles/config/vocabularies/morpheus/accept.txt +++ b/ci/vale/styles/config/vocabularies/morpheus/accept.txt @@ -18,6 +18,9 @@ CMake 
Conda CPython [Cc]ryptocurrenc[y|ies] +cuDF +cuML +CuPy [Cc]yber [Cc]ybersecurity Cython @@ -51,7 +54,9 @@ NeMo nginx NIC NIM(s?) +NumPy NVIDIA +pandas [Pp]arallelization [Pp]arsable PCIe diff --git a/docker/run_container.sh b/docker/run_container.sh new file mode 100755 index 0000000000..7d368556ef --- /dev/null +++ b/docker/run_container.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Color variables +b="\033[0;36m" +g="\033[0;32m" +r="\033[0;31m" +e="\033[0;90m" +y="\033[0;33m" +x="\033[0m" + +_UNDEF_VAR_ERROR_MSG="Use the dev/release scripts to set these automatically" + +DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:?"Must set \$DOCKER_IMAGE_NAME. ${_UNDEF_VAR_ERROR_MSG}"} +DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:?"Must set \$DOCKER_IMAGE_TAG. ${_UNDEF_VAR_ERROR_MSG}"} + +# DOCKER_ARGS are set by the dev/release scripts +# DOCKER_EXTRA_ARGS are optionally set by the user +DOCKER_ARGS=${DOCKER_ARGS:-""} +DOCKER_ARGS="${DOCKER_ARGS} --net=host --cap-add=sys_nice ${DOCKER_EXTRA_ARGS}" +DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} + +if [[ -n "${CPU_ONLY}" ]]; then + echo -e "${b}Executing in CPU only mode${x}" + DOCKER_ARGS="${DOCKER_ARGS} --runtime=runc" +else + echo -e "${b}Executing in GPU mode${x}" + DOCKER_ARGS="${DOCKER_ARGS} --runtime=nvidia --gpus=all" +fi + +if [[ -n "${SSH_AUTH_SOCK}" ]]; then + echo -e "${b}Setting up ssh-agent auth socket${x}" + DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" +fi + +echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" + +# Enable command logging to show what is being executed +set -x +docker run ${DOCA_EXTRA_ARGS} --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" + +{ EXIT_CODE=$?; set +x; } 2>/dev/null + +exit $EXIT_CODE diff --git a/docker/run_container_dev.sh b/docker/run_container_dev.sh index 9a2db756af..0caa949c80 100755 --- a/docker/run_container_dev.sh +++ b/docker/run_container_dev.sh @@ -14,38 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# set -x +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -# Color variables -b="\033[0;36m" -g="\033[0;32m" -r="\033[0;31m" -e="\033[0;90m" -y="\033[0;33m" -x="\033[0m" +export DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"morpheus"} +export DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"dev-$(date +'%y%m%d')"} -DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"morpheus"} -DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"dev-$(date +'%y%m%d')"} -DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} +export DOCKER_ARGS="-v $PWD:/workspace -v /dev/hugepages:/dev/hugepages --privileged" -DOCKER_ARGS="--runtime=nvidia --env WORKSPACE_VOLUME=${PWD} -v $PWD:/workspace --net=host --gpus=all --cap-add=sys_nice" - -if [[ -n "${SSH_AUTH_SOCK}" ]]; then - echo -e "${b}Setting up ssh-agent auth socket${x}" - DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" -fi - -echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" - -set -x -docker run \ - -v /dev/hugepages:/dev/hugepages \ - --privileged \ - --rm \ - -ti \ - ${DOCKER_ARGS} ${DOCKER_EXTRA_ARGS} \ - ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" - -{ EXIT_CODE=$?; set +x; } 2>/dev/null - -exit $EXIT_CODE +# Call the general run script +${SCRIPT_DIR}/run_container.sh diff --git a/docker/run_container_release.sh b/docker/run_container_release.sh index dce2132b1a..5ea4e3fd74 100755 --- a/docker/run_container_release.sh +++ b/docker/run_container_release.sh @@ -16,48 +16,23 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -# Color variables -b="\033[0;36m" -g="\033[0;32m" -r="\033[0;31m" -e="\033[0;90m" -y="\033[0;33m" -x="\033[0m" - # Change to the script file to ensure we are in the correct repo (in case were in a submodule) pushd ${SCRIPT_DIR} &> /dev/null MORPHEUS_SUPPORT_DOCA=${MORPHEUS_SUPPORT_DOCA:-OFF} -MORPHEUS_BUILD_MORPHEUS_LLM=${MORPHEUS_BUILD_MORPHEUS_LLM:-ON} -MORPHEUS_BUILD_MORPHEUS_DFP=${MORPHEUS_BUILD_MORPHEUS_DFP:-ON} - -DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"nvcr.io/nvidia/morpheus/morpheus"} -DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"$(git describe --tags --abbrev=0)-runtime"} -# This variable is used for passing extra arguments to the docker run command. Do not use DOCKER_ARGS for this purpose. -DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} +export DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"nvcr.io/nvidia/morpheus/morpheus"} +export DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"$(git describe --tags --abbrev=0)-runtime"} popd &> /dev/null -DOCKER_ARGS="--runtime=nvidia --env WORKSPACE_VOLUME=${PWD} --net=host --gpus=all --cap-add=sys_nice ${DOCKER_EXTRA_ARGS}" - -if [[ -n "${SSH_AUTH_SOCK}" ]]; then - echo -e "${b}Setting up ssh-agent auth socket${x}" - DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" -fi - -# DPDK requires hugepage and privileged container -DOCA_EXTRA_ARGS="" +# DPDK (and thus DOCA) requires hugepage and privileged container +export DOCKER_ARGS="" if [[ ${MORPHEUS_SUPPORT_DOCA} == @(TRUE|ON) ]]; then - echo -e "${b}Enabling DOCA Support. Mounting /dev/hugepages and running in privileged mode${x}" + echo -e "Enabling DOCA Support. 
Mounting /dev/hugepages and running in privileged mode" DOCKER_ARGS="${DOCKER_ARGS} -v /dev/hugepages:/dev/hugepages --privileged" fi - -echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" - -# Enable command logging to show what is being executed -set -x -docker run ${DOCA_EXTRA_ARGS} --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" -set +x +# Call the general run script +${SCRIPT_DIR}/run_container.sh diff --git a/docs/source/conf.py b/docs/source/conf.py index aa59786e26..d743df18e1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -199,6 +199,7 @@ r'^http://$', r'^https://$', r'https://(platform\.)?openai.com', + r'https://code.visualstudio.com' ] # Add any paths that contain templates here, relative to this directory. diff --git a/docs/source/developer_guide/guides/2_real_world_phishing.md b/docs/source/developer_guide/guides/2_real_world_phishing.md index c821d16a0c..0d27f0de98 100644 --- a/docs/source/developer_guide/guides/2_real_world_phishing.md +++ b/docs/source/developer_guide/guides/2_real_world_phishing.md @@ -980,7 +980,7 @@ The code for our sink will be similar to other stages with a few changes. First, ```python @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): ``` Our sink will function as a pass-through allowing the possibility of other sinks to be added to the pipeline. We could, hypothetically, have a pipeline where we emit the results to both RabbitMQ and a file. For this reason we will also be using the `PassThruTypeMixin`. @@ -1032,6 +1032,7 @@ import pika from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -1039,7 +1040,7 @@ logger = logging.getLogger(__name__) @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. diff --git a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md index 4f60bcf155..b9a2e3a786 100644 --- a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md +++ b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md @@ -88,7 +88,7 @@ Defines a single column and type-cast. | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | #### Custom Column (`CustomColumn`) Subclass of `ColumnInfo`, defines a column to be computed by a user-defined function `process_column_fn`. 
@@ -96,7 +96,7 @@ Subclass of `ColumnInfo`, defines a column to be computed by a user-defined func | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `process_column_fn` | `function` | Function which receives the entire `DataFrame` as its only input, returning a new [`pandas.Series`](https://pandas.pydata.org/docs/reference/api/pandas.Series.html) object to be stored in column `name`. | | `input_column_types` | `dict[str, str]` | The input columns and the expected [`dtype` strings](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) that are needed for this Column to successfully process. Setting this as `None` will pass all columns. Specifying which columns are needed improves performance. | @@ -139,7 +139,7 @@ Subclass of `RenameColumn`, specific to casting UTC localized `datetime` values. | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_name` | `str` | Original column name | #### String-Join Column (`StringJoinColumn`) @@ -148,7 +148,7 @@ Subclass of `RenameColumn`, converts incoming `list` values to string by joining | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_name` | `str` | Original column name | | `sep` | `str` | Separator string to use for the join | @@ -158,7 +158,7 @@ Subclass of `ColumnInfo`, concatenates values from multiple columns into a new s | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_columns` | `List[str]` | List of columns to concatenate | | `sep` | `str` | Separator string | diff --git a/examples/abp_pcap_detection/abp_pcap_preprocessing.py b/examples/abp_pcap_detection/abp_pcap_preprocessing.py index f4ebdfbb04..ebc0392217 100644 --- a/examples/abp_pcap_detection/abp_pcap_preprocessing.py +++ b/examples/abp_pcap_detection/abp_pcap_preprocessing.py @@ -16,17 +16,16 @@ from functools import partial import cupy as cp -import mrc import numpy as np import cudf -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config from 
morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryFIL from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage @@ -184,7 +183,7 @@ def round_time_kernel(timestamp, rollup_time, secs): seq_ids[:, 2] = fea_len - 1 # Create the inference memory. Keep in mind count here could be > than input count - memory = _messages.InferenceMemoryFIL(count=count, input__0=data, seq_ids=seq_ids) + memory = InferenceMemoryFIL(count=count, input__0=data, seq_ids=seq_ids) infer_message = ControlMessage(msg) infer_message.payload(meta) @@ -197,6 +196,3 @@ def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage fea_len=self._fea_length, fea_cols=self.features, req_cols=self.req_cols) - - def _get_preprocess_node(self, builder: mrc.Builder): - raise NotImplementedError("C++ node not implemented for this stage") diff --git a/examples/cpu_only/run.py b/examples/cpu_only/run.py new file mode 100644 index 0000000000..f0a50a47e0 --- /dev/null +++ b/examples/cpu_only/run.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import pathlib +import sys +import typing + +import click + +from morpheus.cli.utils import get_log_levels +from morpheus.cli.utils import parse_log_level +from morpheus.config import Config +from morpheus.config import CppConfig +from morpheus.config import ExecutionMode +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.pipeline.linear_pipeline import LinearPipeline +from morpheus.pipeline.stage_decorator import stage +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.general.trigger_stage import TriggerStage +from morpheus.stages.input.file_source_stage import FileSourceStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.stages.postprocess.serialize_stage import SerializeStage +from morpheus.stages.preprocess.deserialize_stage import DeserializeStage +from morpheus.utils.logger import configure_logging + +logger = logging.getLogger(f"morpheus.{__name__}") + + +@click.command() +@click.option('--use_cpu_only', + default=False, + type=bool, + is_flag=True, + help=("Whether or not to run in CPU only mode, setting this to True will disable C++ mode.")) +@click.option("--log_level", + default="DEBUG", + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + show_default=True, + help="Specify the logging level to use.") +@click.option( + "--in_file", + help="Input file", + required=True, + type=click.Path(exists=True, readable=True), +) +@click.option( + "--out_file", + help="Output file", + type=click.Path(dir_okay=False), + default="output.csv", + required=True, +) +def run_pipeline(log_level: int, use_cpu_only: bool, in_file: pathlib.Path, out_file: pathlib.Path): + # Enable the default logger + configure_logging(log_level=log_level) + + if 
use_cpu_only: + execution_mode = ExecutionMode.CPU + else: + execution_mode = ExecutionMode.GPU + + config = Config() + config.execution_mode = execution_mode + + pipeline = LinearPipeline(config) + + pipeline.set_source(FileSourceStage(config, filename=in_file)) + + pipeline.add_stage(MonitorStage(config, description="source")) + + pipeline.add_stage(TriggerStage(config)) + + @stage(execution_modes=(execution_mode, )) + def print_msg(msg: typing.Any) -> typing.Any: + log_msg = [f"Receive a message of type {type(msg)}"] + if isinstance(msg, MessageMeta): + log_msg.append(f"- df type: {type(msg.df)}") + + logger.debug(" ".join(log_msg)) + + return msg + + pipeline.add_stage(print_msg(config)) + + pipeline.add_stage(DeserializeStage(config)) + + pipeline.add_stage(MonitorStage(config, description="deserialize")) + + @stage(execution_modes=(execution_mode, )) + def calculate_totals(msg: ControlMessage, *, total_column_name: str = "total") -> ControlMessage: + meta = msg.payload() + + with meta.mutable_dataframe() as df: + logger.debug("Received a ControlMessage with a dataframe of type %s", type(df)) + df[total_column_name] = df.select_dtypes(include="number").sum(axis=1) + + return msg + + pipeline.add_stage(calculate_totals(config)) + pipeline.add_stage(SerializeStage(config)) + pipeline.add_stage(WriteToFileStage(config, filename=out_file, overwrite=True)) + pipeline.build() + + logger.info("Running pipeline\tC++ mode = %s\texecution_mode = %s", + CppConfig.get_should_use_cpp(), + config.execution_mode) + + pipeline.run() + + known_gpu_packages = ['cudf', 'cuml', 'tensorrt', 'torch'] + known_gpu_packages_loaded = [pkg in sys.modules for pkg in known_gpu_packages] + + if any(known_gpu_packages_loaded): + for (i, pkg) in enumerate(known_gpu_packages): + if known_gpu_packages_loaded[i]: + msg = f"{pkg} is loaded" + if use_cpu_only: + logger.error(msg) + else: + logger.info(msg) + else: + logger.info("No GPU packages loaded") + + +if __name__ == "__main__": + run_pipeline() diff --git a/examples/developer_guide/1_simple_python_stage/pass_thru.py b/examples/developer_guide/1_simple_python_stage/pass_thru.py index 7e6a8e125c..52edba71e7 100644 --- a/examples/developer_guide/1_simple_python_stage/pass_thru.py +++ b/examples/developer_guide/1_simple_python_stage/pass_thru.py @@ -19,12 +19,13 @@ from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ A Simple Pass Through Stage """ diff --git a/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py b/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py index da9c51fa9a..9755f63765 100644 --- a/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py +++ b/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py @@ -15,10 +15,11 @@ import typing +from morpheus.config import ExecutionMode from morpheus.pipeline.stage_decorator import stage -@stage +@stage(execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) def pass_thru_stage(message: typing.Any) -> typing.Any: # Return the message for the next stage return message diff --git a/examples/developer_guide/2_1_real_world_phishing/run.py 
b/examples/developer_guide/2_1_real_world_phishing/run.py index b0907924aa..32e53042f7 100755 --- a/examples/developer_guide/2_1_real_world_phishing/run.py +++ b/examples/developer_guide/2_1_real_world_phishing/run.py @@ -75,7 +75,7 @@ default="phishing-bert-onnx", help="The name of the model that is deployed on Tritonserver.", ) -@click.option("--server_url", default='localhost:8001', help="Tritonserver url.") +@click.option("--server_url", default='localhost:8000', help="Tritonserver url.") @click.option( "--output_file", default=os.path.join(tempfile.gettempdir(), "detections.jsonlines"), diff --git a/examples/developer_guide/2_2_rabbitmq/README.md b/examples/developer_guide/2_2_rabbitmq/README.md index cadd6075a2..5b657b580f 100644 --- a/examples/developer_guide/2_2_rabbitmq/README.md +++ b/examples/developer_guide/2_2_rabbitmq/README.md @@ -54,6 +54,7 @@ If no exchange named 'logs' exists in RabbitMQ it will be created. By default th ## Launch the writer In a third terminal from the root of the Morpheus repo execute: ```bash +export MORPHEUS_ROOT=$(pwd) python examples/developer_guide/2_2_rabbitmq/write_simple.py ``` diff --git a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py index 347d02e131..182e9e556f 100644 --- a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py +++ b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py @@ -22,20 +22,20 @@ import pandas as pd import pika -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) @register_stage("from-rabbitmq") -class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): +class RabbitMQSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load messages from a RabbitMQ queue. 
@@ -77,6 +77,9 @@ def __init__(self, self._poll_interval = pd.Timedelta(poll_interval) + # This will return either cudf.DataFrame or pandas.DataFrame depending on the execution mode + self._df_pkg = get_df_pkg(config.execution_mode) + @property def name(self) -> str: return "from-rabbitmq" @@ -97,7 +100,7 @@ def source_generator(self, subscription: mrc.Subscription) -> collections.abc.It if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = self._df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py index de24cf9873..58255bf557 100644 --- a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py +++ b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py @@ -22,15 +22,15 @@ import pandas as pd import pika -import cudf - +from morpheus.config import ExecutionMode from morpheus.messages.message_meta import MessageMeta from morpheus.pipeline.stage_decorator import source +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) -@source(name="from-rabbitmq") +@source(name="from-rabbitmq", execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) def rabbitmq_source(subscription: mrc.Subscription, host: str, exchange: str, @@ -69,13 +69,15 @@ def rabbitmq_source(subscription: mrc.Subscription, poll_interval = pd.Timedelta(poll_interval) + df_pkg = get_df_pkg() + try: while subscription.is_subscribed(): (method_frame, _, body) = channel.basic_get(queue_name) if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/2_2_rabbitmq/read_simple.py b/examples/developer_guide/2_2_rabbitmq/read_simple.py index 2b26d2ba6a..c00e0728ed 100755 --- a/examples/developer_guide/2_2_rabbitmq/read_simple.py +++ b/examples/developer_guide/2_2_rabbitmq/read_simple.py @@ -22,6 +22,7 @@ from morpheus.common import FileTypes from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -33,12 +34,20 @@ is_flag=True, default=False, help="Use the function based version of the RabbitMQ source stage instead of the class") -def run_pipeline(use_source_function: bool): +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_source_function: bool, use_cpu_only: bool, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear 
pipeline object pipeline = LinearPipeline(config) diff --git a/examples/developer_guide/2_2_rabbitmq/write_simple.py b/examples/developer_guide/2_2_rabbitmq/write_simple.py index 78fa2c3d26..f2e6e76430 100755 --- a/examples/developer_guide/2_2_rabbitmq/write_simple.py +++ b/examples/developer_guide/2_2_rabbitmq/write_simple.py @@ -16,23 +16,34 @@ import logging import os +import click from write_to_rabbitmq_stage import WriteToRabbitMQStage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.utils.logger import configure_logging -def run_pipeline(): +@click.command() +@click.option('--input_file', + type=click.Path(exists=True, readable=True), + default=os.path.join(os.environ['MORPHEUS_ROOT'], 'examples/data/email.jsonlines')) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_cpu_only: bool, input_file: str, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - root_dir = os.environ['MORPHEUS_ROOT'] - input_file = os.path.join(root_dir, 'examples/data/email.jsonlines') - config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear pipeline object pipeline = LinearPipeline(config) diff --git a/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py b/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py index 401d8b785e..fb5382eda6 100644 --- a/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py +++ b/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -29,7 +30,7 @@ @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. 
diff --git a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py index 3b71aa727f..9ea9d1d8f6 100644 --- a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py +++ b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py @@ -21,13 +21,14 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): def __init__(self, config: Config): super().__init__(config) diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py index 453041534c..752ee0fb01 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py @@ -21,20 +21,20 @@ import pandas as pd import pika -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) @register_stage("from-rabbitmq") -class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): +class RabbitMQSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load messages from a RabbitMQ queue. 
@@ -72,6 +72,9 @@ def __init__(self, self._poll_interval = pd.Timedelta(poll_interval) + # This will return either cudf.DataFrame or pandas.DataFrame depending on the execution mode + self._df_pkg = get_df_pkg(config.execution_mode) + @property def name(self) -> str: return "from-rabbitmq" @@ -119,7 +122,7 @@ def source_generator(self, subscription: mrc.Subscription): if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = self._df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py index 401d8b785e..fb5382eda6 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -29,7 +30,7 @@ @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py index b8271bb79a..66d5ffd76b 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py @@ -21,7 +21,7 @@ from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -29,20 +29,19 @@ @click.command() -@click.option('--use_cpp', default=True) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), type=click.IntRange(min=1), help="Number of internal pipeline threads to use", ) -def run_pipeline(use_cpp, num_threads): +def run_pipeline(use_cpu_only: bool, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(use_cpp) - config = Config() + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU config.num_threads = num_threads # Create a linear pipeline object diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py index b9cdf761e5..a4954d8ae1 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py @@ -16,23 +16,34 @@ import logging import 
os +import click from rabbitmq_cpp_stage.write_to_rabbitmq_stage import WriteToRabbitMQStage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.utils.logger import configure_logging -def run_pipeline(): +@click.command() +@click.option('--input_file', + type=click.Path(exists=True, readable=True), + default=os.path.join(os.environ['MORPHEUS_ROOT'], 'examples/data/email.jsonlines')) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_cpu_only: bool, input_file: str, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - root_dir = os.environ['MORPHEUS_ROOT'] - input_file = os.path.join(root_dir, 'examples/data/email.jsonlines') - config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear pipeline object pipeline = LinearPipeline(config) diff --git a/examples/digital_fingerprinting/production/dfp_azure_pipeline.py b/examples/digital_fingerprinting/production/dfp_azure_pipeline.py index dab4122ebd..d470217b83 100644 --- a/examples/digital_fingerprinting/production/dfp_azure_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_azure_pipeline.py @@ -32,7 +32,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -230,9 +229,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/dfp_duo_pipeline.py b/examples/digital_fingerprinting/production/dfp_duo_pipeline.py index c1e3e00495..2cd08bfb7b 100644 --- a/examples/digital_fingerprinting/production/dfp_duo_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_duo_pipeline.py @@ -32,7 +32,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -227,9 +226,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py b/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py index 5e857929f7..7782961760 100644 --- a/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py +++ 
b/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py @@ -74,12 +74,6 @@ default="60d", help="The training duration to run starting from start_time", ) -@click.option( - "--use_cpp", - type=click.BOOL, - default=True, - help=("Indicates what type of logs are going to be used in the workload."), -) @click.option( "--cache_dir", type=str, @@ -135,7 +129,6 @@ def run_pipeline(source: str, sample_rate_s: int, tracking_uri, silence_monitors, - use_cpp, mlflow_experiment_name_template, mlflow_model_name_template, **kwargs): @@ -167,7 +160,7 @@ def run_pipeline(source: str, # Default timestamp column -- override with ControlMessage timestamp_column_name = "timestamp" - config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name, use_cpp=use_cpp) + config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name) # Construct the data frame Schema used to normalize incoming data schema_builder = SchemaBuilder(config, source) diff --git a/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py b/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py index 587dc81358..198bfa528d 100644 --- a/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py @@ -74,12 +74,6 @@ default="60d", help="The training duration to run starting from start_time", ) -@click.option( - "--use_cpp", - type=click.BOOL, - default=True, - help=("Indicates what type of logs are going to be used in the workload."), -) @click.option( "--cache_dir", type=str, @@ -147,7 +141,6 @@ def run_pipeline(source: str, sample_rate_s: int, tracking_uri, silence_monitors, - use_cpp, mlflow_experiment_name_template, mlflow_model_name_template, **kwargs): @@ -180,7 +173,7 @@ def run_pipeline(source: str, # Default timestamp column -- override with ControlMessage timestamp_column_name = "timestamp" - config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name, use_cpp=use_cpp) + config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name) # Construct the data frame Schema used to normalize incoming data schema_builder = SchemaBuilder(config, source) diff --git a/examples/digital_fingerprinting/production/grafana/run.py b/examples/digital_fingerprinting/production/grafana/run.py index 47d8e927d5..c62c0de1c6 100644 --- a/examples/digital_fingerprinting/production/grafana/run.py +++ b/examples/digital_fingerprinting/production/grafana/run.py @@ -34,7 +34,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -242,9 +241,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md index e43755094e..f27e29cdd4 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md +++ 
b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md @@ -38,14 +38,9 @@ In the `/workspace` directory of the container, run the following to compile Mor ./scripts/compile.sh ``` -Now install Morpheus: -```bash -pip install -e /workspace -``` - Install additional required dependencies: ```bash -mamba env update \ +conda env update --solver=libmamba \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` @@ -87,8 +82,7 @@ Morpheus pipeline configurations for each workflow are managed using [pipelines_ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, ... ``` diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py b/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py index 480893e8b8..d8a1825b72 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py @@ -100,7 +100,6 @@ def _create_config(self): config = generate_ae_config(source=(self._pipe_conf.get('source')), userid_column_name=(self._pipe_conf.get('userid_column_name')), timestamp_column_name=(self._pipe_conf.get('timestamp_column_name')), - use_cpp=(self._pipe_conf.get('use_cpp')), pipeline_batch_size=(self._pipe_conf.get('pipeline_batch_size')), edge_buffer_size=(self._pipe_conf.get('edge_buffer_size')), num_threads=(self._pipe_conf.get('num_threads'))) diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json b/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json index a15edde34f..6049014497 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json @@ -9,8 +9,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_lti_e2e": { "message_path": "../control_messages/azure_payload_lti.json", @@ -21,8 +20,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_lti_s3_e2e": { "message_path": "../control_messages/azure_payload_lti_s3.json", @@ -33,8 +31,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_training_e2e": { "message_path": "../control_messages/azure_payload_training.json", @@ -45,8 +42,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_inference_e2e": { "message_path": "../control_messages/azure_streaming_inference.json", @@ -57,8 +53,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_lti_e2e": { "message_path": "../control_messages/azure_streaming_lti.json", @@ -69,8 +64,7 @@ "duration": "60d", "userid_column_name": "username", 
"timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_training_e2e": { "message_path": "../control_messages/azure_streaming_training.json", @@ -81,8 +75,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_duo_payload_inference_e2e": { "message_path": "../control_messages/duo_payload_inference.json", @@ -93,8 +86,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_lti_e2e": { "message_path": "../control_messages/duo_payload_lti.json", @@ -105,8 +97,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_only_load_e2e": { "message_path": "../control_messages/duo_payload_only_load.json", @@ -117,8 +108,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_training_e2e": { "message_path": "../control_messages/duo_payload_training.json", @@ -129,8 +119,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_inference_e2e": { "message_path": "../control_messages/duo_streaming_inference.json", @@ -141,8 +130,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_lti_e2e": { "message_path": "../control_messages/duo_streaming_lti.json", @@ -153,8 +141,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_only_load_e2e": { "message_path": "../control_messages/duo_streaming_only_load.json", @@ -165,8 +152,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_payload_e2e": { "message_path": "../control_messages/duo_streaming_payload.json", @@ -177,8 +163,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_training_e2e": { "message_path": "../control_messages/duo_streaming_training.json", @@ -189,8 +174,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_stages_azure_training_e2e": { "glob_path": "../../../../data/dfp/azure-training-data/*.json", @@ -201,8 +185,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": false + "source": "azure" }, "test_dfp_stages_azure_inference_e2e": { "glob_path": "../../../../data/dfp/azure-inference-data/*.json", @@ -213,8 +196,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": false + "source": "azure" }, "test_dfp_stages_duo_training_e2e": { "glob_path": 
"../../../../data/dfp/duo-training-data/*.json", @@ -225,8 +207,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": false + "source": "duo" }, "test_dfp_stages_duo_inference_e2e": { "glob_path": "../../../../data/dfp/duo-inference-data/*.json", @@ -237,7 +218,6 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": false + "source": "duo" } } diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb index 39be0c336f..7047b9003c 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb @@ -65,7 +65,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", "from morpheus.utils.column_info import ColumnInfo\n", @@ -194,9 +193,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb index 0002a318b8..3377fb2158 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb @@ -187,8 +187,7 @@ "config: Config = generate_ae_config(\n", " source,\n", " userid_column_name=\"username\",\n", - " timestamp_column_name=\"timestamp\",\n", - " use_cpp=True,\n", + " timestamp_column_name=\"timestamp\"\n", ")\n", "\n", "# Construct the dataframe Schema which is used to normalize incoming azure logs\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb index 4547dea6e9..a30d892b5e 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb @@ -62,7 +62,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.utils.column_info import ColumnInfo\n", "from morpheus.utils.column_info import DataFrameInputSchema\n", @@ -191,9 +190,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb 
b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb index 675952b652..c407b5caef 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb @@ -63,7 +63,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", "from morpheus.utils.column_info import BoolColumn\n", @@ -193,9 +192,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb index 086786e9a1..60fbd83b5b 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb @@ -190,7 +190,6 @@ " source,\n", " userid_column_name=\"username\",\n", " timestamp_column_name=\"timestamp\",\n", - " use_cpp=True,\n", ")\n", "\n", "# Construct the dataframe Schema which is used to normalize incoming duo logs\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb index a0a30e2c07..35a4fa02d5 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb @@ -62,7 +62,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.utils.column_info import BoolColumn\n", "from morpheus.utils.column_info import ColumnInfo\n", @@ -192,9 +191,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/starter/README.md b/examples/digital_fingerprinting/starter/README.md index 89c2e60a66..30e96f3011 100644 --- a/examples/digital_fingerprinting/starter/README.md +++ b/examples/digital_fingerprinting/starter/README.md @@ -31,69 +31,60 @@ DFP pipelines can be constructed and run using the Morpheus CLI command `morpheu Use `--help` to display information about the autoencoder pipeline command line options: ``` -morpheus run pipeline-ae --help +Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 [ARGS]...]... -Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 - [ARGS]...]... + Configure and run the pipeline. To configure the pipeline, list the stages in the order that data should flow. The output of each stage will become the input for the next + stage. 
For example, to read, classify and write to a file, the following stages could be used - Configure and run the pipeline. To configure the pipeline, list the stages - in the order that data should flow. The output of each stage will become the - input for the next stage. For example, to read, classify and write to a - file, the following stages could be used + pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model --server_url=localhost:8001 filter --threshold=0.5 to-file + --filename=classifications.json - pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model - --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json - - Pipelines must follow a few rules: - 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` - 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline - 3. Only one inference stage can be used. Zero is also fine - 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` + Pipelines must follow a few rules: 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` 2. A `deserialize` stage must be placed + between the source stages and the rest of the pipeline 3. Only one inference stage can be used. Zero is also fine 4. The following stages must come after an inference + stage: `add-class`, `filter`, `gen-viz` Options: - --columns_file FILE [default: ./morpheus/data/columns_ae_cloudtrail.txt] - --labels_file FILE Specifies a file to read labels from in - order to convert class IDs into labels. A - label file is a simple text file where each - line corresponds to a label. If unspecified, - only a single output label is created for - FIL - --userid_column_name TEXT Which column to use as the User ID. - [default: userIdentityaccountId; required] - --userid_filter TEXT Specifying this value will filter all - incoming data to only use rows with matching - User IDs. Which column is used for the User - ID is specified by `userid_column_name` - --feature_scaler TEXT Autoencoder feature scaler [default: - standard] - --use_generic_model BOOLEAN Whether to use a generic model when user does - not have minimum number of training rows - [default: False] - --viz_file FILE Save a visualization of the pipeline at the - specified location + --columns_file DATA FILE Specifies a file to read column features. [required] + --labels_file DATA FILE Specifies a file to read labels from in order to convert class IDs into labels. A label file is a simple text file where each line + corresponds to a label. + --userid_column_name TEXT Which column to use as the User ID. [default: userIdentityaccountId; required] + --userid_filter TEXT Specifying this value will filter all incoming data to only use rows with matching User IDs. Which column is used for the User ID is + specified by `userid_column_name` + --feature_scaler [NONE|STANDARD|GAUSSRANK] + Autoencoder feature scaler [default: STANDARD] + --use_generic_model Whether to use a generic model when user does not have minimum number of training rows + --viz_file FILE Save a visualization of the pipeline at the specified location + --viz_direction [BT|LR|RL|TB] Set the direction for the Graphviz pipeline diagram, ignored unless --viz_file is also specified. [default: LR] + --timestamp_column_name TEXT Which column to use as the timestamp. 
[default: timestamp; required] --help Show this message and exit. Commands: - add-class Add detected classifications to each message - add-scores Add probability scores to each message - buffer (Deprecated) Buffer results - delay (Deprecated) Delay results for a certain duration - filter Filter message by a classification threshold - from-azure Source stage is used to load Azure Active Directory messages. - from-cloudtrail Load messages from a CloudTrail directory - from-duo Source stage is used to load Duo Authentication messages. - gen-viz (Deprecated) Write out visualization data frames - inf-pytorch Perform inference with PyTorch - inf-triton Perform inference with Triton - monitor Display throughput numbers at a specific point in the - pipeline - preprocess Convert messages to tokens - serialize Include & exclude columns from messages - timeseries Perform time series anomaly detection and add prediction. - to-file Write all messages to a file - to-kafka Write all messages to a Kafka cluster - train-ae Deserialize source data from JSON - validate Validates pipeline output against an expected output + add-class Add detected classifications to each message. + add-scores Add probability scores to each message. + buffer (Deprecated) Buffer results. + delay (Deprecated) Delay results for a certain duration. + filter Filter message by a classification threshold. + from-arxiv Source stage that downloads PDFs from arxiv and converts them to dataframes. + from-azure Source stage is used to load Azure Active Directory messages. + from-cloudtrail Load messages from a CloudTrail directory. + from-databricks-deltalake Source stage used to load messages from a DeltaLake table. + from-duo Source stage is used to load Duo Authentication messages. + from-http Source stage that starts an HTTP server and listens for incoming requests on a specified endpoint. + from-http-client Source stage that polls a remote HTTP server for incoming data. + from-rss Load RSS feed items into a DataFrame. + inf-pytorch Perform inference with PyTorch. + monitor Display throughput numbers at a specific point in the pipeline. + preprocess Prepare Autoencoder input DataFrames for inference. + serialize Includes & excludes columns from messages. + timeseries Perform time series anomaly detection and add prediction. + to-elasticsearch This class writes the messages as documents to Elasticsearch. + to-file Write all messages to a file. + to-http Write all messages to an HTTP endpoint. + to-http-server Sink stage that starts an HTTP server and listens for incoming requests on a specified endpoint. + to-kafka Write all messages to a Kafka cluster. + train-ae Train an Autoencoder model on incoming data. + trigger Buffer data until the previous stage has completed. + validate Validate pipeline output for testing. ``` The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure pipeline and stages. 
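For orientation, the `pipeline-ae` options shown above map onto fields of the `Config` and `ConfigAutoEncoder` objects used when building the same pipeline from Python. A minimal sketch follows (not part of this patch; the attribute names are assumptions based on `run_cloudtrail_dfp.py` further down in this diff):

```python
from morpheus.config import AEFeatureScalar, Config, ConfigAutoEncoder, PipelineModes

config = Config()
config.mode = PipelineModes.AE

config.ae = ConfigAutoEncoder()
config.ae.userid_column_name = "userIdentityaccountId"  # --userid_column_name
config.ae.timestamp_column_name = "timestamp"           # --timestamp_column_name
config.ae.feature_scaler = AEFeatureScalar.STANDARD     # --feature_scaler
```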
The following table shows mapping between the main Morpheus CLI commands and underlying Morpheus Python stage classes: @@ -160,9 +151,9 @@ Run the following in your Morpheus container to start the CloudTrail DFP pipelin ``` morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ + run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 \ pipeline-ae \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ + --columns_file=data/columns_ae_cloudtrail.txt \ --userid_column_name=userIdentitysessionContextsessionIssueruserName \ --userid_filter=user123 \ --feature_scaler=standard \ @@ -186,9 +177,9 @@ The following pipeline trains user models from downloaded training data and save on downloaded inference data. Inference results are written to `duo-detections.csv`. ``` morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ + run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 \ pipeline-ae \ - --columns_file=morpheus/data/columns_ae_duo.txt \ + --columns_file=data/columns_ae_duo.txt \ --userid_column_name=username \ --feature_scaler=standard \ from-duo \ @@ -211,9 +202,9 @@ morpheus --log_level=DEBUG \ The following example shows how we can load pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. ``` morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ + run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 \ pipeline-ae \ - --columns_file=morpheus/data/columns_ae_duo.txt \ + --columns_file=data/columns_ae_duo.txt \ --userid_column_name=username \ --feature_scaler=standard \ from-duo \ @@ -260,9 +251,9 @@ morpheus --log_level=DEBUG \ The following example shows how we can load pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. ``` morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ + run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 \ pipeline-ae \ - --columns_file=morpheus/data/columns_ae_azure.txt \ + --columns_file=data/columns_ae_azure.txt \ --userid_column_name=userPrincipalName \ --feature_scaler=standard \ from-azure \ @@ -287,7 +278,7 @@ run the example. Train user models from files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves user models to file. Pipeline then uses these models to run inference on CloudTrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. 
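Note that the `--columns_file` arguments in the commands below no longer carry the `morpheus/` prefix: the script now declares the option with `MorpheusRelativePath` (see the `run_cloudtrail_dfp.py` diff after the commands), which, judging by its use here, resolves packaged Morpheus data files as well as ordinary filesystem paths. A minimal sketch of such an option declaration, adapted from that diff:

```python
import click

from morpheus.cli.utils import MorpheusRelativePath


@click.command()
@click.option(
    "--columns_file",
    type=MorpheusRelativePath(exists=True, readable=True),  # accepts data/... relative paths
    required=True,
    help="Feature columns file",
)
def cli(columns_file: str):
    # Echo the resolved path for demonstration.
    print(columns_file)
```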
``` python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ + --columns_file=data/columns_ae_cloudtrail.txt \ --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ --train_data_glob=models/datasets/training-data/dfp-*.csv \ --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ @@ -297,7 +288,7 @@ python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \ Here we load pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. ``` python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ + --columns_file=data/columns_ae_cloudtrail.txt \ --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ --output_file=./cloudtrail-dfp-results.csv diff --git a/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py index 835b3e2809..ef4f00b7dc 100644 --- a/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py +++ b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py @@ -18,10 +18,11 @@ import click +from morpheus.cli.utils import MorpheusRelativePath +from morpheus.common import TypeId from morpheus.config import AEFeatureScalar from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -58,7 +59,7 @@ ) @click.option( "--columns_file", - type=click.Path(exists=True, readable=True), + type=MorpheusRelativePath(exists=True, readable=True), required=True, help="Feature columns file", ) @@ -101,8 +102,6 @@ def run_pipeline(num_threads, """Configure and run the pipeline.""" configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(False) - config = Config() config.mode = PipelineModes.AE config.ae = ConfigAutoEncoder() @@ -137,7 +136,7 @@ def run_pipeline(num_threads, pipeline.add_stage(AutoEncoderInferenceStage(config)) # Add anomaly scores and z-scores to each message - pipeline.add_stage(AddScoresStage(config)) + pipeline.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) # Add serialize stage pipeline.add_stage(SerializeStage(config)) diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py index 09d6304042..0143dcc6c6 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py @@ -30,7 +30,6 @@ from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.utils.column_info import ColumnInfo @@ -180,9 +179,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - 
CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py index f039644b77..475e34e245 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py @@ -30,7 +30,6 @@ from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.utils.column_info import BoolColumn @@ -183,9 +182,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/doca/run_tcp.py b/examples/doca/run_tcp.py index 5c4b4035a7..cf2e797efc 100644 --- a/examples/doca/run_tcp.py +++ b/examples/doca/run_tcp.py @@ -17,7 +17,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.doca.doca_convert_stage import DocaConvertStage @@ -71,8 +70,6 @@ def run_pipeline(pipeline_batch_size, model_max_batch_size, model_fea_length, ou # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/run_udp_convert.py b/examples/doca/run_udp_convert.py index 52c9b216b7..c88c80ac6c 100644 --- a/examples/doca/run_udp_convert.py +++ b/examples/doca/run_udp_convert.py @@ -19,7 +19,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import parse_log_level from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.messages import RawPacketMessage from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -90,8 +89,6 @@ def run_pipeline(nic_addr: str, # Enable the default logger configure_logging(log_level=log_level) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/run_udp_raw.py b/examples/doca/run_udp_raw.py index 576ecff957..cb31c5bb6c 100644 --- a/examples/doca/run_udp_raw.py +++ b/examples/doca/run_udp_raw.py @@ -17,7 +17,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.messages import RawPacketMessage from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -41,8 +40,6 @@ def run_pipeline(nic_addr, gpu_addr): # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/vdb_realtime/vdb.py b/examples/doca/vdb_realtime/vdb.py index 79c9aee420..226213d2f6 100644 --- a/examples/doca/vdb_realtime/vdb.py +++ b/examples/doca/vdb_realtime/vdb.py @@ -18,7 +18,6 @@ import pymilvus from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from 
morpheus.stages.doca.doca_convert_stage import DocaConvertStage @@ -119,8 +118,6 @@ def run_pipeline(nic_addr: str, # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP config.pipeline_batch_size = 1024 diff --git a/examples/gnn_fraud_detection_pipeline/run.py b/examples/gnn_fraud_detection_pipeline/run.py index 6a3268f174..27361d05ea 100644 --- a/examples/gnn_fraud_detection_pipeline/run.py +++ b/examples/gnn_fraud_detection_pipeline/run.py @@ -18,7 +18,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -100,8 +99,6 @@ def run_pipeline(num_threads, # Enable the default logger. configure_logging(log_level=logging.INFO) - CppConfig.set_should_use_cpp(False) - # Its necessary to get the global config object and configure it for FIL mode. config = Config() config.mode = PipelineModes.OTHER diff --git a/examples/llm/cli.py b/examples/llm/cli.py index 867d670345..3eae3f5ffb 100644 --- a/examples/llm/cli.py +++ b/examples/llm/cli.py @@ -35,23 +35,15 @@ type=click.Choice(get_log_levels(), case_sensitive=False), callback=parse_log_level, help="Specify the logging level to use.") -@click.option('--use_cpp', - default=True, - type=bool, - help=("Whether or not to use C++ node and message types or to prefer python. " - "Only use as a last resort if bugs are encountered")) @click.version_option() @click.pass_context -def cli(ctx: click.Context, log_level: int, use_cpp: bool): +def cli(ctx: click.Context, log_level: int): """Main entrypoint for the LLM Examples""" - from morpheus.config import CppConfig from morpheus.utils.logger import configure_logging ctx_dict = ctx.ensure_object(dict) - CppConfig.set_should_use_cpp(use_cpp) - # Configure the logging configure_logging(log_level=log_level) diff --git a/examples/llm/vdb_upload/run.py b/examples/llm/vdb_upload/run.py index f02ed5dfe0..b3099d845f 100644 --- a/examples/llm/vdb_upload/run.py +++ b/examples/llm/vdb_upload/run.py @@ -115,7 +115,7 @@ def run(): @click.option( "--triton_server_url", type=str, - default="localhost:8001", + default="localhost:8000", help="Triton server URL.", ) @click.option( diff --git a/examples/log_parsing/inference.py b/examples/log_parsing/inference.py index 67f4062409..099928cff9 100644 --- a/examples/log_parsing/inference.py +++ b/examples/log_parsing/inference.py @@ -19,7 +19,6 @@ import tritonclient.grpc as tritonclient from scipy.special import softmax -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes @@ -62,7 +61,7 @@ def build_output_message(self, msg: ControlMessage) -> ControlMessage: seq_ids[:, 0] = cp.arange(0, msg.tensors().count, dtype=cp.uint32) seq_ids[:, 2] = msg.tensors().get_tensor('seq_ids')[:, 2] - memory = _messages.TensorMemory( + memory = TensorMemory( count=msg.tensors().count, tensors={ 'confidences': cp.zeros((msg.tensors().count, self._inputs[list(self._inputs.keys())[0]].shape[1])), diff --git a/examples/ransomware_detection/README.md b/examples/ransomware_detection/README.md index 0388140227..0619af26ec 100644 --- a/examples/ransomware_detection/README.md +++ b/examples/ransomware_detection/README.md @@ -68,7 +68,7 @@ Once Triton server finishes starting up, 
it will display the status of all loade Run the following from the root of the Morpheus repo to start the ransomware detection pipeline: ```bash -python examples/ransomware_detection/run.py --server_url=localhost:8001 \ +python examples/ransomware_detection/run.py --server_url=localhost:8000 \ --sliding_window=3 \ --model_name=ransomw-model-short-rf \ --input_glob=./examples/data/appshield/*/snapshot-*/*.json \ diff --git a/examples/ransomware_detection/run.py b/examples/ransomware_detection/run.py index a89c7c93f2..a94fe301f6 100644 --- a/examples/ransomware_detection/run.py +++ b/examples/ransomware_detection/run.py @@ -20,7 +20,6 @@ import yaml from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -39,7 +38,6 @@ @click.command() @click.option('--debug', default=False) -@click.option('--use_cpp', default=False, help="Enable C++ execution for this pipeline, currently this is unsupported.") @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), @@ -102,7 +100,6 @@ help="The path to the file where the inference output will be saved.", ) def run_pipeline(debug, - use_cpp, num_threads, n_dask_workers, threads_per_dask_worker, @@ -122,8 +119,6 @@ def run_pipeline(debug, snapshot_fea_length = 99 - CppConfig.set_should_use_cpp(use_cpp) - # Its necessary to get the global config object and configure it for FIL mode. config = Config() config.mode = PipelineModes.FIL @@ -205,6 +200,7 @@ def run_pipeline(debug, model_name=model_name, server_url=server_url, force_convert_inputs=True, + thread_count=1 # Work-around for issue #1891, remove once resolved. )) # Add a monitor stage. diff --git a/examples/ransomware_detection/stages/create_features.py b/examples/ransomware_detection/stages/create_features.py index 862747d9a6..3ca214caad 100644 --- a/examples/ransomware_detection/stages/create_features.py +++ b/examples/ransomware_detection/stages/create_features.py @@ -12,38 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. -import typing - import mrc +import pandas as pd from mrc.core import operators as ops from dask.distributed import Client +import cudf + from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.control_message_stage import ControlMessageStage -from morpheus.stages.input.appshield_source_stage import AppShieldMessageMeta +from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from common.data_models import FeatureConfig # pylint: disable=no-name-in-module # isort: skip from common.feature_extractor import FeatureExtractor # pylint: disable=no-name-in-module # isort: skip @register_stage("create-features", modes=[PipelineModes.FIL]) -class CreateFeaturesRWStage(ControlMessageStage): +class CreateFeaturesRWStage(PreallocatorMixin, ControlMessageStage): """ - This class extends ControlMessageStage to deal with scenario specific features from Appshiled plugins data. + This stage creates features from Appshield plugins data.
Parameters ---------- c : morpheus.config.Config Pipeline configuration instance - interested_plugins : typing.List[str] + interested_plugins : list[str] Only intrested plugins files will be read from Appshield snapshots - feature_columns : typing.List[str] + feature_columns : list[str] List of features needed to be extracted. - file_extns : typing.List[str] + file_extns : list[str] File extensions. n_workers: int, default = 2 Number of dask workers. @@ -54,9 +56,9 @@ class CreateFeaturesRWStage(ControlMessageStage): def __init__( self, c: Config, - interested_plugins: typing.List[str], - feature_columns: typing.List[str], - file_extns: typing.List[str], + interested_plugins: list[str], + feature_columns: list[str], + file_extns: list[str], n_workers: int = 2, threads_per_worker: int = 2, ): @@ -73,20 +75,23 @@ def __init__( def name(self) -> str: return "create-features-rw" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Returns accepted input types for this stage. """ - return (AppShieldMessageMeta, ) + return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def on_next(self, x: AppShieldMessageMeta): + def on_next(self, msg: ControlMessage) -> list[ControlMessage]: snapshot_fea_dfs = [] - df = x.df + with msg.payload().mutable_dataframe() as cdf: + df = cdf.to_pandas() + + msg_source = msg.get_metadata("source") # Type cast CommitCharge. df["CommitCharge"] = df["CommitCharge"].astype("float").astype("Int32") @@ -118,25 +123,23 @@ def on_next(self, x: AppShieldMessageMeta): # Snapshot sequence will be generated using `source_pid_process`. # Determines which source generated the snapshot messages. # There's a chance of receiving the same snapshots names from multiple sources(hosts) - features_df['source_pid_process'] = x.source + '_' + features_df.pid_process + features_df['source_pid_process'] = msg_source + '_' + features_df.pid_process + + # Cast int values to string preventing the df from converting to cuDF. + features_df['ldrmodules_df_path'] = features_df['ldrmodules_df_path'].astype(str) # Sort entries by pid_process and snapshot_id features_df = features_df.sort_values(by=["pid_process", "snapshot_id"]).reset_index(drop=True) - # Create AppShieldMessageMeta with extracted features information. 
- meta = AppShieldMessageMeta(features_df, x.source) + return self.split_messages(msg_source, features_df) - return meta + def split_messages(self, msg_source: str, df: pd.DataFrame) -> list[ControlMessage]: - def create_control_messages(self, app_shield_message_meta: AppShieldMessageMeta) -> typing.List[ControlMessage]: - - control_messages = [] - - df = app_shield_message_meta.df + output_messages = [] pid_processes = df.pid_process.unique() - # Create multi messaage per pid_process, this assumes that the DF has been sorted by the `pid_process` column + # Create a unique message per pid_process; this assumes the DF has been sorted by the `pid_process` column for pid_process in pid_processes: pid_process_index = df[df.pid_process == pid_process].index @@ -144,13 +147,15 @@ def create_control_messages(self, app_shield_message_meta: AppShieldMessageMeta) start = pid_process_index.min() stop = pid_process_index.max() + 1 - sliced_meta = app_shield_message_meta.get_slice(start, stop) - control_message = ControlMessage() - control_message.payload(sliced_meta) + cdf = cudf.DataFrame(df.iloc[start:stop]) + + out_msg = ControlMessage() + out_msg.payload(MessageMeta(cdf)) + out_msg.set_metadata("source", msg_source) - control_messages.append(control_message) + output_messages.append(out_msg) - return control_messages + return output_messages def on_completed(self): # Close dask client when pipeline initiates shutdown @@ -159,7 +164,6 @@ def on_completed(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = builder.make_node(self.unique_name, ops.map(self.on_next), - ops.map(self.create_control_messages), ops.on_completed(self.on_completed), ops.flatten()) builder.make_edge(input_node, node) diff --git a/examples/ransomware_detection/stages/preprocessing.py b/examples/ransomware_detection/stages/preprocessing.py index 3715f92425..68f6c8bc85 100644 --- a/examples/ransomware_detection/stages/preprocessing.py +++ b/examples/ransomware_detection/stages/preprocessing.py @@ -18,12 +18,12 @@ import mrc import pandas as pd -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryFIL from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage from common.data_models import SnapshotData # pylint: disable=no-name-in-module #isort:skip @@ -39,13 +39,13 @@ class PreprocessingRWStage(PreprocessBaseStage): ---------- c : morpheus.config.Config Pipeline configuration instance - feature_columns : typing.List[str] + feature_columns : list[str] List of features needed to be extracted. sliding_window: int, default = 3 Window size to arrange the sanpshots in seequential order. """ - def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: int = 3): + def __init__(self, c: Config, feature_columns: list[str], sliding_window: int = 3): super().__init__(c) @@ -54,7 +54,7 @@ def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: self._features_len = len(self._feature_columns) # Stateful member to hold unprocessed snapshots. - self._snapshot_dict: typing.Dict[str, typing.List[SnapshotData]] = {} + self._snapshot_dict: dict[str, list[SnapshotData]] = {} # Padding data to map inference response with input messages.
self._padding_data = [0 for i in range(self._features_len * sliding_window)] @@ -64,11 +64,10 @@ def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: def name(self) -> str: return "preprocess-rw" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def _sliding_window_offsets(self, ids: typing.List[int], ids_len: int, - window: int) -> typing.List[typing.Tuple[int]]: + def _sliding_window_offsets(self, ids: list[int], ids_len: int, window: int) -> list[tuple[int]]: """ Create snapshot_id's sliding sequence for a given window """ @@ -86,10 +85,7 @@ def _sliding_window_offsets(self, ids: typing.List[int], ids_len: int, return sliding_window_offsets - def _rollover_pending_snapshots(self, - snapshot_ids: typing.List[int], - source_pid_process: str, - snapshot_df: pd.DataFrame): + def _rollover_pending_snapshots(self, snapshot_ids: list[int], source_pid_process: str, snapshot_df: pd.DataFrame): """ Store the unprocessed snapshots from current run to a stateful member to process them in the next run. """ @@ -130,7 +126,9 @@ def _pre_process_batch(self, msg: ControlMessage) -> ControlMessage: Current run's unprocessed snapshots will be rolled over to the next. """ - snapshot_df = msg.payload().df + meta = msg.payload() + snapshot_df = meta.copy_dataframe().to_pandas() + curr_snapshots_size = len(snapshot_df) # Set snapshot_id as index this is used to get ordered snapshots based on sliding window. @@ -174,19 +172,18 @@ def _pre_process_batch(self, msg: ControlMessage) -> ControlMessage: self._rollover_pending_snapshots(snapshot_ids, source_pid_process, snapshot_df) # This column is used to identify whether sequence is genuine or dummy - msg.payload().set_data('sequence', sequence) + meta.set_data('sequence', sequence) # Convert data to cupy array data = cp.asarray(data) - seg_ids = cp.zeros((curr_snapshots_size, 3), dtype=cp.uint32) - seg_ids[:, 0] = cp.arange(0, curr_snapshots_size, dtype=cp.uint32) - seg_ids[:, 2] = self._features_len * 3 + seq_ids = cp.zeros((curr_snapshots_size, 3), dtype=cp.uint32) + seq_ids[:, 0] = cp.arange(0, curr_snapshots_size, dtype=cp.uint32) + seq_ids[:, 2] = self._features_len * 3 - memory = _messages.InferenceMemoryFIL(count=curr_snapshots_size, input__0=data, seq_ids=seg_ids) - msg.tensors(memory) + memory = InferenceMemoryFIL(count=curr_snapshots_size, input__0=data, seq_ids=seq_ids) msg.set_metadata("inference_memory_params", {"inference_type": "fil"}) - + msg.tensors(memory) return msg def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: diff --git a/examples/sid_visualization/README.md b/examples/sid_visualization/README.md index 10aeb4cbee..c1d88b25b4 100644 --- a/examples/sid_visualization/README.md +++ b/examples/sid_visualization/README.md @@ -96,7 +96,7 @@ After the GUI has been launched, Morpheus now needs to be started. 
In the same s ```bash python examples/sid_visualization/run.py \ --debug \ - --triton_server_url=triton:8001 \ + --triton_server_url=triton:8000 \ --input_file=./examples/data/sid_visualization/group1-benign-2nodes.jsonlines \ --input_file=./examples/data/sid_visualization/group2-benign-50nodes.jsonlines \ --input_file=./examples/data/sid_visualization/group3-si-50nodes.jsonlines \ @@ -147,7 +147,7 @@ morpheus --log_level=DEBUG \ pipeline-nlp --model_seq_length=256 \ from-file --filename=${DEMO_DATASET} \ deserialize \ - preprocess --vocab_hash_file=morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ + preprocess --vocab_hash_file=data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ inf-triton --model_name=sid-minibert-onnx --server_url=triton:8001 --force_convert_inputs=True \ monitor --description Inference\ Rate --unit=inf \ add-class \ diff --git a/examples/sid_visualization/run.py b/examples/sid_visualization/run.py index 4db84fac11..6e58a92791 100644 --- a/examples/sid_visualization/run.py +++ b/examples/sid_visualization/run.py @@ -21,7 +21,6 @@ from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.io.deserializers import read_file_to_df from morpheus.messages import MessageMeta @@ -120,7 +119,6 @@ def _generate_frames(self): @click.command() @click.option("--debug/--no-debug", default=False) -@click.option('--use_cpp', default=True) @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), @@ -148,16 +146,14 @@ def _generate_frames(self): default="sid-minibert-onnx", help="The name of the model that is deployed on Tritonserver.", ) -@click.option("--triton_server_url", default="localhost:8001", required=True, help="Tritonserver url.") -def run_pipeline(debug, use_cpp, num_threads, input_file, max_batch_size, model_name, triton_server_url): +@click.option("--triton_server_url", default="localhost:8000", required=True, help="Tritonserver url.") +def run_pipeline(debug, num_threads, input_file, max_batch_size, model_name, triton_server_url): if debug: configure_logging(log_level=logging.DEBUG) else: configure_logging(log_level=logging.INFO) - CppConfig.set_should_use_cpp(use_cpp) - # Its necessary to get the global config object and configure it for FIL mode. 
config = Config() config.mode = PipelineModes.NLP diff --git a/external/utilities b/external/utilities index 722c1352a0..85f8f7af2e 160000 --- a/external/utilities +++ b/external/utilities @@ -1 +1 @@ -Subproject commit 722c1352a0e9b9f606d343714cee88578c12e455 +Subproject commit 85f8f7af2e8d9bc7bde978cd40c40297b1116957 diff --git a/models/model-cards/dfp-model-card.md b/models/model-cards/dfp-model-card.md index 71c0eebc04..88b453d254 100644 --- a/models/model-cards/dfp-model-card.md +++ b/models/model-cards/dfp-model-card.md @@ -52,7 +52,7 @@ The model architecture consists of an Autoencoder, where the reconstruction loss * Reconstruction loss (per feature) **Output Parameters:** -* Pandas DataFrame +* pandas DataFrame ## Software Integration: **Runtime:** diff --git a/morpheus.code-workspace b/morpheus.code-workspace index 0ec937642b..092d5d6d20 100644 --- a/morpheus.code-workspace +++ b/morpheus.code-workspace @@ -304,6 +304,18 @@ "request": "launch", "type": "debugpy" }, + { + "args": [ + "-x" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": false, + "module": "pytest", + "name": "Python: tests", + "request": "launch", + "type": "debugpy" + }, { "MIMode": "gdb", "args": [ diff --git a/pyproject.toml b/pyproject.toml index 678c060041..d50c73630f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,8 +15,9 @@ markers = [ "slow: Slow tests", "kafka: Tests that require a running instance of kafka", "milvus: Tests that require a running instance of milvus", - "use_cpp: Test support C++ nodes and objects", - "use_python: Test only supports Python nodes and objects", + "gpu_mode: Test supports GPU nodes and objects", + "cpu_mode: Test only supports CPU nodes and objects", + "gpu_and_cpu_mode: Test supports both GPU and CPU nodes and objects", "use_cudf: Test supports cuDF datasets", "use_pandas: Test supports Pandas datasets", "replace_callback: Replaces the results_callback in cli", diff --git a/python/morpheus/morpheus/_lib/common/module.cpp b/python/morpheus/morpheus/_lib/common/module.cpp index c36d349bac..f9ba779f10 100644 --- a/python/morpheus/morpheus/_lib/common/module.cpp +++ b/python/morpheus/morpheus/_lib/common/module.cpp @@ -28,7 +28,6 @@ #include "morpheus/objects/filter_source.hpp" #include "morpheus/objects/tensor_object.hpp" // for TensorObject #include "morpheus/objects/wrapped_tensor.hpp" -#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/http_server.hpp" #include "morpheus/version.hpp" @@ -58,9 +57,6 @@ PYBIND11_MODULE(common, _module) :toctree: _generate )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - LoaderRegistry::register_factory_fn( "file", [](nlohmann::json config) { diff --git a/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp b/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp index 22e668cfe3..c10bdc4a78 100644 --- a/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp +++ b/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp @@ -24,13 +24,16 @@ #include // for object, dict, list #include // IWYU pragma: keep -#include // for system_clock, time_point +// for system_clock, time_point +#include // IWYU pragma: keep #include // for map #include // for shared_ptr #include // for optional #include // for string #include // for vector +// IWYU pragma: no_include + namespace morpheus { enum class MORPHEUS_EXPORT ControlMessageType { NONE, INFERENCE, TRAINING }; -// class 
PayloadManager -// { -// public: -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// TensorObject& get_tensor(const std::string& name) -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return const TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// const TensorObject& get_tensor(const std::string& name) const -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Set the tensor object identified by `name` -// * -// * @param name -// * @param tensor -// * @throws std::length_error If the number of rows in `tensor` does not match `count`. -// */ -// void set_tensor(const std::string& name, TensorObject&& tensor) -// { -// m_tensors->set_tensor(name, std::move(tensor)); -// } - -// /** -// * @brief Get a reference to the internal tensors map -// * -// * @return const TensorMap& -// */ -// const TensorMap& get_tensors() const -// { -// return m_tensors->get_tensors(); -// } - -// /** -// * @brief Set the tensors object -// * -// * @param tensors -// * @throws std::length_error If the number of rows in the `tensors` do not match `count`. -// */ -// void set_tensors(TensorMap&& tensors) -// { -// m_tensors->set_tensors(std::move(tensors)); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// TensorObject& get_column(const std::string& name) -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return const TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// const TensorObject& get_column(const std::string& name) const -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Set the tensor object identified by `name` -// * -// * @param name -// * @param tensor -// * @throws std::length_error If the number of rows in `tensor` does not match `count`. -// */ -// void set_column(const std::string& name, TensorObject&& tensor) -// { -// m_tensors->set_tensor(name, std::move(tensor)); -// } - -// /** -// * @brief Get a reference to the internal tensors map -// * -// * @return const TensorMap& -// */ -// TableInfo get_columns() const -// { -// return m_df->get_info(); -// } - -// /** -// * @brief Set the tensors object -// * -// * @param tensors -// * @throws std::length_error If the number of rows in the `tensors` do not match `count`. 
-// */ -// void set_columns(TableInfo&& tensors) -// { -// m_tensors->set_tensors(std::move(tensors)); -// } - -// private: -// std::shared_ptr m_df; -// std::shared_ptr m_tensors; -// }; - class MORPHEUS_EXPORT TensorMemory; // System-clock for better compatibility with pybind11/chrono @@ -369,6 +250,8 @@ class MORPHEUS_EXPORT ControlMessage static const std::string s_config_schema; // NOLINT static std::map s_task_type_map; // NOLINT + ControlMessageType to_task_type(const std::string& task_type, bool throw_on_error) const; + ControlMessageType m_cm_type{ControlMessageType::NONE}; std::shared_ptr m_payload{nullptr}; std::shared_ptr m_tensors{nullptr}; @@ -382,11 +265,13 @@ class MORPHEUS_EXPORT ControlMessage struct MORPHEUS_EXPORT ControlMessageProxy { /** - * @brief Creates a new ControlMessage instance from a configuration dictionary. - * @param config A pybind11::dict representing the configuration for the ControlMessage. + * @brief Creates a new ControlMessage instance from either a Python instance of a ControlMessage or a configuration + * dictionary. + * @param config_or_message Either a Python instance of a ControlMessage or a dict representing the configuration + * for the ControlMessage. * @return A shared_ptr to a newly created ControlMessage instance. */ - static std::shared_ptr create(pybind11::dict& config); + static std::shared_ptr create(pybind11::object& config_or_message); /** * @brief Creates a new ControlMessage instance as a copy of an existing one. diff --git a/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp b/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp index 5eb6636919..7a87620b90 100644 --- a/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp +++ b/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp @@ -40,7 +40,7 @@ namespace morpheus { struct CudfHelper { public: - __attribute__((visibility("default"))) static void load(); + static void load(); /** * @brief Converts a C++ table to a Python DataTable object @@ -67,6 +67,9 @@ struct CudfHelper * @return TableInfoData */ static TableInfoData table_info_data_from_table(pybind11::object table); + + private: + CudfHelper(); }; /** @} */ // end of group diff --git a/python/morpheus/morpheus/_lib/messages/__init__.pyi b/python/morpheus/morpheus/_lib/messages/__init__.pyi index 74d7a522d4..4974b93daf 100644 --- a/python/morpheus/morpheus/_lib/messages/__init__.pyi +++ b/python/morpheus/morpheus/_lib/messages/__init__.pyi @@ -36,7 +36,7 @@ class ControlMessage(): @typing.overload def __init__(self, arg0: ControlMessage) -> None: ... @typing.overload - def __init__(self, arg0: dict) -> None: ... + def __init__(self, arg0: object) -> None: ... def add_task(self, task_type: str, task: object | None) -> None: ... @typing.overload def config(self) -> object | None: ... 
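The stub change above widens `ControlMessage.__init__` from `dict` to `object`: as the `module.cpp` and `control.cpp` changes below show, the single `create` overload now accepts a config `dict`, a Python-implemented `ControlMessage` (converted via its `_export_config`, payload and tensors), or `None`. A rough sketch of the accepted forms from Python (the exact task/metadata keys are assumptions based on the config handling in `control.cpp`):

```python
from morpheus._lib.messages import ControlMessage

# Default construction
msg = ControlMessage()

# From a config dict: "type" selects the task type ("inference", "training" or "none"),
# while "tasks" and "metadata" are optional
msg = ControlMessage({
    "type": "inference",
    "tasks": [{"type": "inference", "properties": {}}],
    "metadata": {"source": "example"},
})

# Copy construction from an existing ControlMessage instance
msg_copy = ControlMessage(msg)
```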
diff --git a/python/morpheus/morpheus/_lib/messages/module.cpp b/python/morpheus/morpheus/_lib/messages/module.cpp
index fdc5fce73b..af559bde58 100644
--- a/python/morpheus/morpheus/_lib/messages/module.cpp
+++ b/python/morpheus/morpheus/_lib/messages/module.cpp
@@ -29,8 +29,7 @@
 #include "morpheus/messages/raw_packet.hpp"
 #include "morpheus/objects/data_table.hpp"
 #include "morpheus/objects/mutable_table_ctx_mgr.hpp"
-#include "morpheus/pybind11/json.hpp"  // IWYU pragma: keep
-#include "morpheus/utilities/cudf_util.hpp"
+#include "morpheus/pybind11/json.hpp"         // IWYU pragma: keep
 #include "morpheus/utilities/json_types.hpp"  // for json_t
 #include "morpheus/utilities/string_util.hpp"
 #include "morpheus/version.hpp"
@@ -127,10 +126,7 @@ PYBIND11_MODULE(messages, _module)
     )pbdoc";

-    // Load the cudf helpers
-    CudfHelper::load();
-
-    mrc::pymrc::import(_module, "cupy");
+    mrc::pymrc::import(_module, "cupy");  // It should be safe to import cupy in CPU only mode
     mrc::pymrc::import(_module, "morpheus._lib.common");  // Required for SegmentObject
@@ -256,8 +252,8 @@ PYBIND11_MODULE(messages, _module)
     py::class_<ControlMessage, std::shared_ptr<ControlMessage>>(_module, "ControlMessage")
         .def(py::init<>())
-        .def(py::init(py::overload_cast<py::dict&>(&ControlMessageProxy::create)))
         .def(py::init(py::overload_cast<std::shared_ptr<ControlMessage>>(&ControlMessageProxy::create)))
+        .def(py::init(py::overload_cast<py::object&>(&ControlMessageProxy::create)))
         .def("add_task", &ControlMessage::add_task, py::arg("task_type"), py::arg("task"))
         .def(
             "config", py::overload_cast<const morpheus::utilities::json_t&>(&ControlMessage::config), py::arg("config"))
diff --git a/python/morpheus/morpheus/_lib/src/messages/control.cpp b/python/morpheus/morpheus/_lib/src/messages/control.cpp
index ca23c5f9f8..d20334c35a 100644
--- a/python/morpheus/morpheus/_lib/src/messages/control.cpp
+++ b/python/morpheus/morpheus/_lib/src/messages/control.cpp
@@ -17,20 +17,25 @@

 #include "morpheus/messages/control.hpp"

-#include "morpheus/messages/meta.hpp"  // for MessageMeta, MessageMetaInterfaceProxy
-
-#include <glog/logging.h>       // for COMPACT_GOOGLE_LOG_INFO, LogMessage, VLOG
-#include <nlohmann/json.hpp>    // for basic_json, json_ref, iter_impl, operator<<
-#include <pybind11/chrono.h>    // IWYU pragma: keep
-#include <pybind11/cast.h>      // for cast, object::cast
-#include <pybind11/pytypes.h>   // for object, none, dict, isinstance, list, str, value_error, generic_item
-#include <pymrc/utils.hpp>      // for cast_from_pyobject
+#include "morpheus/messages/memory/tensor_memory.hpp"  // for TensorMemory, TensorMemoryInterfaceProxy
+#include "morpheus/messages/meta.hpp"                  // for MessageMeta, MessageMetaInterfaceProxy
+#include "morpheus/types.hpp"                          // for TensorIndex
+
+#include <boost/algorithm/string/case_conv.hpp>  // for to_lower_copy
+#include <glog/logging.h>                        // for COMPACT_GOOGLE_LOG_INFO, LogMessage, VLOG
+#include <nlohmann/json.hpp>                     // for basic_json, json_ref, iter_impl, operator<<
+#include <pybind11/chrono.h>                     // IWYU pragma: keep
+#include <pybind11/cast.h>                       // for cast, object::cast
+#include <pybind11/pytypes.h>                    // for object, none, dict, isinstance, list, str, value_error, generic_item
+#include <pybind11/stl.h>                        // IWYU pragma: keep
+#include <pymrc/utils.hpp>                       // for cast_from_pyobject
 #include <optional>   // for optional, nullopt
 #include <ostream>    // for basic_ostream, operator<<
 #include <regex>      // for regex_search, regex
 #include <stdexcept>  // for runtime_error
 #include <utility>    // for pair
+// IWYU pragma: no_include

 namespace py = pybind11;
 using namespace py::literals;
@@ -40,6 +45,7 @@ namespace morpheus {

 const std::string ControlMessage::s_config_schema = R"()";

 std::map<std::string, ControlMessageType> ControlMessage::s_task_type_map{{"inference", ControlMessageType::INFERENCE},
+                                                                          {"none", ControlMessageType::NONE},
                                                                           {"training", ControlMessageType::TRAINING}};

 ControlMessage::ControlMessage() : m_config({{"metadata", morpheus::utilities::json_t::object()}}), m_tasks({}) {}
@@ -53,8 +59,14 @@
ControlMessage::ControlMessage(const morpheus::utilities::json_t& _config) :

 ControlMessage::ControlMessage(const ControlMessage& other)
 {
+    m_cm_type = other.m_cm_type;
+    m_payload = other.m_payload;
+    m_tensors = other.m_tensors;
+
     m_config = other.m_config;
     m_tasks  = other.m_tasks;
+
+    m_timestamps = other.m_timestamps;
 }

 const morpheus::utilities::json_t& ControlMessage::config() const
@@ -65,16 +77,19 @@ const morpheus::utilities::json_t& ControlMessage::config() const

 void ControlMessage::add_task(const std::string& task_type, const morpheus::utilities::json_t& task)
 {
     VLOG(20) << "Adding task of type " << task_type << " to control message" << task.dump(4);
-    auto _task_type = s_task_type_map.contains(task_type) ? s_task_type_map[task_type] : ControlMessageType::NONE;
-
-    if (this->task_type() == ControlMessageType::NONE)
-    {
-        this->task_type(_task_type);
-    }
+    auto _task_type = to_task_type(task_type, false);

-    if (_task_type != ControlMessageType::NONE and this->task_type() != _task_type)
+    if (_task_type != ControlMessageType::NONE)
     {
-        throw std::runtime_error("Cannot add inference and training tasks to the same control message");
+        auto current_task_type = this->task_type();
+        if (current_task_type == ControlMessageType::NONE)
+        {
+            this->task_type(_task_type);
+        }
+        else if (current_task_type != _task_type)
+        {
+            throw std::runtime_error("Cannot mix different types of tasks on the same control message");
+        }
     }

     m_tasks[task_type].push_back(task);
@@ -197,14 +212,7 @@ void ControlMessage::config(const morpheus::utilities::json_t& config)
 {
     if (config.contains("type"))
     {
-        auto task_type  = config.at("type");
-        auto _task_type =
-            s_task_type_map.contains(task_type) ? s_task_type_map.at(task_type) : ControlMessageType::NONE;
-
-        if (this->task_type() == ControlMessageType::NONE)
-        {
-            this->task_type(_task_type);
-        }
+        this->task_type(to_task_type(config.at("type").get<std::string>(), true));
     }

     if (config.contains("tasks"))
@@ -256,10 +264,65 @@ void ControlMessage::task_type(ControlMessageType type)
     m_cm_type = type;
 }

+ControlMessageType ControlMessage::to_task_type(const std::string& task_type, bool throw_on_error) const
+{
+    auto lower_task_type = boost::to_lower_copy(task_type);
+    if (ControlMessage::s_task_type_map.contains(lower_task_type))
+    {
+        return ControlMessage::s_task_type_map.at(lower_task_type);
+    }
+
+    if (throw_on_error)
+    {
+        throw std::runtime_error("Invalid task type: " + task_type);
+    }
+
+    return ControlMessageType::NONE;
+}
+
 /*** Proxy Implementations ***/
-std::shared_ptr<ControlMessage> ControlMessageProxy::create(py::dict& config)
+std::shared_ptr<ControlMessage> ControlMessageProxy::create(py::object& config_or_message)
 {
-    return std::make_shared<ControlMessage>(mrc::pymrc::cast_from_pyobject(config));
+    if (config_or_message.is_none())
+    {
+        return std::make_shared<ControlMessage>();
+    }
+
+    if (py::isinstance<py::dict>(config_or_message))
+    {
+        return std::make_shared<ControlMessage>(mrc::pymrc::cast_from_pyobject(config_or_message));
+    }
+
+    // Assume we received an instance of the Python impl of ControlMessage, as a Python bound instance of the C++
+    // impl of the ControlMessage class would have invoked the shared_ptr overload of the create method
+    py::dict config = config_or_message.attr("_export_config")();
+    auto cm         = std::make_shared<ControlMessage>(mrc::pymrc::cast_from_pyobject(config));
+
+    auto py_meta = config_or_message.attr("payload")();
+    if (!py_meta.is_none())
+    {
+        cm->payload(MessageMetaInterfaceProxy::init_python_meta(py_meta));
+    }
+
+    auto py_tensors = config_or_message.attr("tensors")();
+    if (!py_tensors.is_none())
+    {
+        auto count          = py_tensors.attr("count").cast<TensorIndex>();
+        auto py_tensors_map = py_tensors.attr("get_tensors")();
+        cm->tensors(TensorMemoryInterfaceProxy::init(count, py_tensors_map));
+    }
+
+    auto py_timestamps = config_or_message.attr("_timestamps");
+    if (!py_timestamps.is_none())
+    {
+        auto timestamps_map = py_timestamps.cast<std::map<std::string, time_point_t>>();
+        for (const auto& t : timestamps_map)
+        {
+            cm->set_timestamp(t.first, t.second);
+        }
+    }
+
+    return cm;
 }

 std::shared_ptr<ControlMessage> ControlMessageProxy::create(std::shared_ptr<ControlMessage> other)
diff --git a/python/morpheus/morpheus/_lib/src/messages/meta.cpp b/python/morpheus/morpheus/_lib/src/messages/meta.cpp
index 7426bf7a5d..8b37633612 100644
--- a/python/morpheus/morpheus/_lib/src/messages/meta.cpp
+++ b/python/morpheus/morpheus/_lib/src/messages/meta.cpp
@@ -252,7 +252,7 @@ std::shared_ptr<MessageMeta> MessageMetaInterfaceProxy::init_python(py::object&&
     auto cudf_df_cls = py::module_::import("cudf").attr("DataFrame");
     if (!py::isinstance(data_frame, cudf_df_cls))
     {
-        // Convert to cudf if it's a Pandas DF, thrown an error otherwise
+        // Check if we received a Pandas DF or the Python impl of MessageMeta, throw an error otherwise
         auto pd_df_cls = py::module_::import("pandas").attr("DataFrame");
         if (py::isinstance(data_frame, pd_df_cls))
         {
@@ -265,6 +265,7 @@ std::shared_ptr<MessageMeta> MessageMetaInterfaceProxy::init_python(py::object&&
             auto msg_meta_cls = py::module_::import("morpheus.messages").attr("MessageMeta");
             if (py::isinstance(data_frame, msg_meta_cls))
             {
+                DVLOG(10) << "Converting from a Python impl of MessageMeta to C++ impl";
                 return init_python_meta(data_frame);
             }
             else
diff --git a/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp b/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp
index fbc86ad0d2..2e1c98a84d 100644
--- a/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp
+++ b/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp
@@ -38,7 +38,7 @@

 namespace morpheus {

-void CudfHelper::load()
+CudfHelper::CudfHelper()
 {
     // Avoid loading cudf_helpers if we are in a sphinx build
     if (std::getenv("MORPHEUS_IN_SPHINX_BUILD") == nullptr)
@@ -53,14 +53,21 @@ void CudfHelper::load()
     }
 }

+void CudfHelper::load()
+{
+    static CudfHelper s;
+}
+
 pybind11::object proxy_table_from_table_with_metadata(cudf::io::table_with_metadata&& table, int index_col_count)
 {
+    CudfHelper::load();
     return pybind11::reinterpret_steal<pybind11::object>(
         (PyObject*)make_table_from_table_with_metadata(std::move(table), index_col_count));
 }

 morpheus::TableInfoData proxy_table_info_data_from_table(pybind11::object table)
 {
+    CudfHelper::load();
     return make_table_info_data_from_table(table.ptr());
 }

@@ -71,6 +78,7 @@ pybind11::object CudfHelper::table_from_table_with_metadata(cudf::io::table_with

 pybind11::object CudfHelper::table_from_table_info(const TableInfoBase& table_info)
 {
+    CudfHelper::load();
     // Get the table info data from the table_info
     auto table_info_data = table_info.get_data();

diff --git a/python/morpheus/morpheus/_lib/stages/module.cpp b/python/morpheus/morpheus/_lib/stages/module.cpp
index 51add3410e..266455177e 100644
--- a/python/morpheus/morpheus/_lib/stages/module.cpp
+++ b/python/morpheus/morpheus/_lib/stages/module.cpp
@@ -31,7 +31,6 @@
 #include "morpheus/stages/preprocess_nlp.hpp"  // for PreprocessNLPStage, PreprocessNLPStageInterfaceProxy
 #include "morpheus/stages/serialize.hpp"       // for SerializeStage, SerializeStageInterfaceProxy
 #include "morpheus/stages/write_to_file.hpp"   // for WriteToFileStage, WriteToFileStageInterfaceProxy
-#include "morpheus/utilities/cudf_util.hpp"    // for CudfHelper
 #include "morpheus/utilities/http_server.hpp"  // for DefaultMaxPayloadSize
 #include "morpheus/version.hpp"                // for morpheus_VERSION_MAJOR, morpheus_VERSION_MINOR, morp...

@@ -64,9 +63,6 @@ PYBIND11_MODULE(stages, _module)
     )pbdoc";

-    // Load the cudf helpers
-    CudfHelper::load();
-
     // Make sure to load mrc.core.segment to get ObjectProperties
     mrc::pymrc::import(_module, "mrc.core.segment");

diff --git a/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp b/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp
index 780ad48b37..94fd26aae3 100644
--- a/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp
+++ b/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp
@@ -17,10 +17,9 @@

 #include "../test_utils/common.hpp"  // IWYU pragma: associated

-#include "morpheus/messages/control.hpp"     // for ControlMessage
-#include "morpheus/messages/meta.hpp"        // for MessageMeta
-#include "morpheus/objects/table_info.hpp"   // for MutableTableInfo
-#include "morpheus/utilities/cudf_util.hpp"  // for CudfHelper
+#include "morpheus/messages/control.hpp"    // for ControlMessage
+#include "morpheus/messages/meta.hpp"       // for MessageMeta
+#include "morpheus/objects/table_info.hpp"  // for MutableTableInfo

 #include <gtest/gtest.h>
 #include <pybind11/gil.h>  // for gil_scoped_release, gil_scoped_acquire

@@ -34,20 +33,7 @@
 using namespace morpheus;
 using namespace morpheus::test;

 class TestDevDocEx3 : public morpheus::test::TestWithPythonInterpreter
-{
-  protected:
-    void SetUp() override
-    {
-        morpheus::test::TestWithPythonInterpreter::SetUp();
-        {
-            pybind11::gil_scoped_acquire gil;
-
-            // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in:
-            // https://github.com/rapidsai/cudf/issues/12862
-            CudfHelper::load();
-        }
-    }
-};
+{};

 TEST_F(TestDevDocEx3, TestPyObjFromMultiMesg)
 {
diff --git a/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp b/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp
index cf53f6ea2a..d1ca4a8dcb 100644
--- a/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp
+++ b/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp
@@ -19,26 +19,11 @@

 #include "../test_utils/common.hpp"  // IWYU pragma: associated

-#include "morpheus/utilities/cudf_util.hpp"  // for CudfHelper
-
 #include <gtest/gtest.h>

 namespace morpheus::test {

 class TestMessages : public morpheus::test::TestWithPythonInterpreter
-{
-  protected:
-    void SetUp() override
-    {
-        morpheus::test::TestWithPythonInterpreter::SetUp();
-        {
-            pybind11::gil_scoped_acquire gil;
-
-            // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in:
-            // https://github.com/rapidsai/cudf/issues/12862
-            CudfHelper::load();
-        }
-    }
-};
+{};

 }  // namespace morpheus::test
diff --git a/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp b/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp
index cbefd8355e..27f477511b 100644
--- a/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp
+++ b/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp
@@ -27,7 +27,6 @@

 #include "morpheus/stages/inference_client_stage.hpp"  // for TensorModelMapping, InferenceClientStage, IInferenceCl...
 #include "morpheus/stages/triton_inference.hpp"        // for TritonInferenceClient, TritonInferInput, TritonInferRe...
 #include "morpheus/types.hpp"                          // for TensorMap
-#include "morpheus/utilities/cudf_util.hpp"            // for CudfHelper
 #include "morpheus/utilities/matx_util.hpp"            // for MatxUtil

 #include <cuda_runtime.h>  // for cudaMemcpy, cudaMemcpyKind
@@ -43,7 +42,6 @@
 #include <http_client.h>                          // for Error, InferOptions, InferenceServerHttpClient, InferR...
 #include <mrc/coroutines/task.hpp>                // for Task
 #include <mrc/coroutines/test_scheduler.hpp>      // for TestScheduler
-#include <pybind11/gil.h>                         // for gil_scoped_acquire
 #include <rmm/cuda_stream_per_thread.hpp>         // for cuda_stream_per_thread
 #include <rmm/device_buffer.hpp>                  // for device_buffer
 #include <rmm/mr/device/per_device_resource.hpp>  // for get_current_device_resource

@@ -291,20 +289,7 @@ class ErrorProneTritonClient : public FakeTritonClient
 };

 class TestTritonInferenceStage : public morpheus::test::TestWithPythonInterpreter
-{
-  protected:
-    void SetUp() override
-    {
-        morpheus::test::TestWithPythonInterpreter::SetUp();
-        {
-            pybind11::gil_scoped_acquire gil;
-
-            // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in:
-            // https://github.com/rapidsai/cudf/issues/12862
-            morpheus::CudfHelper::load();
-        }
-    }
-};
+{};

 cudf::io::table_with_metadata create_test_table_with_metadata(uint32_t rows)
 {
diff --git a/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp b/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp
index 552e5bb8a7..55b5465ae3 100644
--- a/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp
+++ b/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp
@@ -20,7 +20,6 @@
 #include "morpheus/io/deserializers.hpp"
 #include "morpheus/io/serializers.hpp"
 #include "morpheus/messages/meta.hpp"
-#include "morpheus/utilities/cudf_util.hpp"

 #include
 #include
@@ -48,20 +47,7 @@ std::string read_file(const std::filesystem::path& file_path)
 }

 class TestFileInOut : public morpheus::test::TestWithPythonInterpreter
-{
-  protected:
-    void SetUp() override
-    {
-        morpheus::test::TestWithPythonInterpreter::SetUp();
-        {
-            pybind11::gil_scoped_acquire gil;
-
-            // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in:
-            // https://github.com/rapidsai/cudf/issues/12862
-            CudfHelper::load();
-        }
-    }
-};
+{};

 TEST_F(TestFileInOut, RoundTripCSV)
 {
diff --git a/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp b/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp
index 1c8eb86fa8..c58f708b6e 100644
--- a/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp
+++ b/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp
@@ -23,7 +23,6 @@
 #include "morpheus/io/loaders/payload.hpp"
 #include "morpheus/io/loaders/rest.hpp"
 #include "morpheus/messages/meta.hpp"
-#include "morpheus/utilities/cudf_util.hpp"
 #include "morpheus/utilities/string_util.hpp"

 #include <Python.h>  // for PyStatus_Exception, PyConfig_Clear, PyConfig_InitPythonConfig
@@ -81,9 +80,6 @@ void TestWithPythonInterpreter::SetUp()
                   false);

     pybind11::gil_scoped_acquire gil;
-
-    // Ensure that the cudf helpers are loaded so we can convert dataframes to MessageMeta
-    CudfHelper::load();
 }

 void TestWithPythonInterpreter::TearDown() {}
diff --git a/python/morpheus/morpheus/cli/commands.py b/python/morpheus/morpheus/cli/commands.py
index f90faadd52..78e5834e16 100644
--- a/python/morpheus/morpheus/cli/commands.py
+++ b/python/morpheus/morpheus/cli/commands.py
@@ -37,6 +37,7 @@
 from morpheus.config import ConfigFIL
 from morpheus.config import ConfigOnnxToTRT
 from morpheus.config import CppConfig
+from morpheus.config import ExecutionMode
 from morpheus.config import PipelineModes
 from morpheus.utils.file_utils import load_labels_file
 from morpheus.utils.logger import configure_logging
@@ -286,8 +287,14 @@ def
install(**kwargs): @click.option('--use_cpp', default=True, type=bool, - help=("Whether or not to use C++ node and message types or to prefer python. " - "Only use as a last resort if bugs are encountered")) + help=("[Deprecated] Whether or not to use C++ node and message types or to prefer python. " + "Only use as a last resort if bugs are encountered. Cannot be used with --use_cpu_only")) +@click.option('--use_cpu_only', + default=False, + type=bool, + is_flag=True, + help=("Whether or not to run in CPU only mode, setting this to True will disable C++ mode. " + "Cannot be used with --use_cpp")) @click.option('--manual_seed', default=None, type=click.IntRange(min=1), @@ -296,8 +303,26 @@ def install(**kwargs): @prepare_command(parse_config=True) def run(ctx: click.Context, **kwargs): """Run subcommand, used for running a pipeline""" - # Since the option isnt the same name as `should_use_cpp` anymore, manually set the value here. - CppConfig.set_should_use_cpp(kwargs.pop("use_cpp", CppConfig.get_should_use_cpp())) + + if (ctx.get_parameter_source("use_cpu_only") is not click.core.ParameterSource.DEFAULT + and ctx.get_parameter_source("use_cpp") is not click.core.ParameterSource.DEFAULT): + # If the user set explicit values for both use_cpu_only and use_cpp raise an error + raise click.UsageError("Cannot set both --use_cpp and --use_cpu_only. The --use_cpp flag is deprecated. " + "Use only --use_cpu_only.") + + use_cpu_only = kwargs.pop("use_cpu_only") + use_cpp = kwargs.pop("use_cpp") + + # only check this value if the flag was explicitly set by the user + if ctx.get_parameter_source("use_cpp") is not click.core.ParameterSource.DEFAULT: + logger.warning("The --use_cpp flag is deprecated and will be removed in a future release") + + execution_mode = ExecutionMode.GPU if use_cpp else ExecutionMode.CPU + else: + execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + + config = get_config_from_ctx(ctx) + config.execution_mode = execution_mode manual_seed_val = kwargs.pop("manual_seed", None) if manual_seed_val is not None: @@ -518,11 +543,6 @@ def pipeline_ae(ctx: click.Context, **kwargs): config = get_config_from_ctx(ctx) config.mode = PipelineModes.AE - - if CppConfig.get_should_use_cpp(): - logger.warning("C++ is disabled for AutoEncoder pipelines at this time.") - CppConfig.set_should_use_cpp(False) - config.ae = ConfigAutoEncoder() config.ae.userid_column_name = kwargs["userid_column_name"] config.ae.timestamp_column_name = kwargs["timestamp_column_name"] diff --git a/python/morpheus/morpheus/config.py b/python/morpheus/morpheus/config.py index 15e0416819..2b0073103e 100644 --- a/python/morpheus/morpheus/config.py +++ b/python/morpheus/morpheus/config.py @@ -140,6 +140,11 @@ class PipelineModes(str, Enum): AE = "AE" +class ExecutionMode(str, Enum): + GPU = "GPU" + CPU = "CPU" + + class CppConfig: """ Allows setting whether C++ implementations should be used for Morpheus stages and messages. Defaults to True, @@ -199,6 +204,7 @@ class Config(ConfigBase): log_config_file : str File corresponding to this Config. """ + execution_mode: ExecutionMode = ExecutionMode.GPU # Whether in Debug mode. debug: bool = False @@ -219,6 +225,41 @@ class Config(ConfigBase): ae: ConfigAutoEncoder = dataclasses.field(default=None) fil: ConfigFIL = dataclasses.field(default=None) + frozen: bool = False + + def freeze(self): + """ + Freeze the Config object, making it immutable. This method will be invoked when the config object is passed to + a pipeline or stage for the first time. 
+
+        Calling `freeze` on a frozen instance will not have any effect.
+        """
+        self._check_cpp_mode(fix_mis_match=not self.frozen)
+        if not self.frozen:
+            self.frozen = True
+
+    def _check_cpp_mode(self, fix_mis_match: bool = False):
+        """
+        Check that the C++ mode matches the execution mode.
+
+        Parameters
+        ----------
+        fix_mis_match : bool
+            If True, set the C++ mode to the correct value. If False, raise an exception if the value is incorrect.
+        """
+        should_use_cpp: bool = (self.execution_mode == ExecutionMode.GPU)
+        if fix_mis_match:
+            CppConfig.set_should_use_cpp(should_use_cpp)
+        elif CppConfig.get_should_use_cpp() != should_use_cpp:
+            raise ValueError(
+                f"Execution mode {self.execution_mode} does not match C++ mode {CppConfig.get_should_use_cpp()}")
+
+    def __setattr__(self, name, value):
+        # `frozen` is a dataclass field with a simple default, so it also exists as a class attribute; this check is
+        # therefore safe even while __init__ is still assigning the other fields.
+        if self.frozen:
+            raise dataclasses.FrozenInstanceError("Cannot modify frozen Config object.")
+
+        super().__setattr__(name, value)

     @property
     def pipeline_batch_size(self):
diff --git a/python/morpheus/morpheus/controllers/file_to_df_controller.py b/python/morpheus/morpheus/controllers/file_to_df_controller.py
index c8478c6ce1..e948a78dc4 100644
--- a/python/morpheus/morpheus/controllers/file_to_df_controller.py
+++ b/python/morpheus/morpheus/controllers/file_to_df_controller.py
@@ -24,8 +24,6 @@
 import fsspec
 import pandas as pd

-import cudf
-
 from morpheus.common import FileTypes
 from morpheus.io.deserializers import read_file_to_df
 from morpheus.utils.column_info import DataFrameInputSchema
@@ -130,7 +128,7 @@ def __init__(self,
         self._downloader = Downloader(download_method=download_method)

     def _get_or_create_dataframe_from_batch(
-            self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]:
+            self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[pd.DataFrame, bool]:

         if (not file_object_batch):
             raise RuntimeError("No file objects to process")
@@ -209,7 +207,7 @@ def convert_to_dataframe(self, file_object_batch: typing.Tuple[fsspec.core.OpenF

         Returns
         -------
-        cudf.DataFrame
+        pd.DataFrame
             The resulting DataFrame.
""" diff --git a/python/morpheus/morpheus/controllers/filter_detections_controller.py b/python/morpheus/morpheus/controllers/filter_detections_controller.py index 10da7b7c9b..7737e061d7 100644 --- a/python/morpheus/morpheus/controllers/filter_detections_controller.py +++ b/python/morpheus/morpheus/controllers/filter_detections_controller.py @@ -15,11 +15,11 @@ import logging import typing -import cupy as cp import numpy as np from morpheus.common import FilterSource from morpheus.messages import ControlMessage +from morpheus.utils.type_aliases import NDArrayType logger = logging.getLogger(__name__) @@ -64,18 +64,13 @@ def field_name(self): """ return self._field_name - def _find_detections(self, msg: ControlMessage) -> typing.Union[cp.ndarray, np.ndarray]: + def _find_detections(self, msg: ControlMessage) -> NDArrayType: # Determine the filter source if self._filter_source == FilterSource.TENSOR: filter_source = msg.tensors().get_tensor(self._field_name) else: filter_source = msg.payload().get_data(self._field_name).values - if (isinstance(filter_source, np.ndarray)): - array_mod = np - else: - array_mod = cp - # Get per row detections detections = (filter_source > self._threshold) @@ -83,9 +78,9 @@ def _find_detections(self, msg: ControlMessage) -> typing.Union[cp.ndarray, np.n detections = detections.any(axis=1) # Surround in False to ensure we get an even number of pairs - detections = array_mod.concatenate([array_mod.array([False]), detections, array_mod.array([False])]) + detections = np.concatenate([np.array([False]), detections, np.array([False])]) - return array_mod.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) + return np.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) def filter_copy(self, msg: ControlMessage) -> ControlMessage: """ diff --git a/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py b/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py index 8bc1be6829..b7065130c7 100644 --- a/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py +++ b/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py @@ -33,10 +33,11 @@ from mlflow.types.utils import _infer_pandas_column as _mlflow_infer_pandas_column from mlflow.types.utils import _infer_schema -import cudf - from morpheus.messages import ControlMessage -from morpheus.models.dfencoder import AutoEncoder +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + from morpheus.models.dfencoder import AutoEncoder logger = logging.getLogger(__name__) @@ -235,7 +236,7 @@ def on_data(self, message: ControlMessage) -> ControlMessage: user = message.get_metadata("user_id") - model: AutoEncoder = message.get_metadata("model") + model: "AutoEncoder" = message.get_metadata("model") model_path = "dfencoder" reg_model_name = self.user_id_to_model(user_id=user) @@ -283,7 +284,7 @@ def on_data(self, message: ControlMessage) -> ControlMessage: # prepare_df to show the actual inputs to the model (any extra are discarded) input_df = message.payload().get_data().iloc[0:1] - if isinstance(input_df, cudf.DataFrame): + if is_cudf_type(input_df): input_df = input_df.to_pandas() prepared_df = model.prepare_df(input_df) diff --git a/python/morpheus/morpheus/controllers/monitor_controller.py b/python/morpheus/morpheus/controllers/monitor_controller.py index 21916a3eb7..940d079097 100644 --- a/python/morpheus/morpheus/controllers/monitor_controller.py +++ b/python/morpheus/morpheus/controllers/monitor_controller.py @@ -19,12 +19,12 
@@
 import fsspec
 from tqdm import tqdm

-import cudf
-
 from morpheus.messages import ControlMessage
 from morpheus.messages import MessageMeta
 from morpheus.utils.logger import LogLevels
 from morpheus.utils.monitor_utils import MorpheusTqdm
+from morpheus.utils.type_aliases import DataFrameType
+from morpheus.utils.type_utils import is_dataframe

 logger = logging.getLogger(__name__)

@@ -57,6 +57,7 @@ class MonitorController:
         Custom implementation of tqdm if required.
     """

+    SupportedTypes = typing.Union[DataFrameType, MessageMeta, ControlMessage, list]
     controller_count: int = 0

     def __init__(self,
@@ -125,20 +126,19 @@ def refresh_progress(self, _):
         """
         self._progress.refresh()

-    def progress_sink(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, list]):
+    def progress_sink(self, msg: SupportedTypes) -> SupportedTypes:
         """
         Receives a message and determines the count of the message. The progress bar is displayed and the progress is
         updated.

         Parameters
         ----------
-        msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List]
+        msg: SupportedTypes
             Message that determines the count of the message

         Returns
         -------
-        msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, list]
-
+        SupportedTypes
         """

         # Make sure the progress bar is shown
@@ -158,14 +158,14 @@ def progress_sink(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMe

         return msg

-    def auto_count_fn(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List]):
+    def auto_count_fn(self, msg: SupportedTypes) -> typing.Callable[[SupportedTypes], int] | None:
         """
         This is a helper function that is used to determine the count of messages received by the
         monitor.

         Parameters
         ----------
-        msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List]
+        msg: SupportedTypes
             Message that determines the count of the message

         Returns
@@ -183,7 +183,7 @@ def auto_count_fn(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMe

         if (isinstance(msg, list) and len(msg) == 0):
             return None

-        if (isinstance(msg, cudf.DataFrame)):
+        if (is_dataframe(msg)):
             return lambda y: len(y.index)

         if (isinstance(msg, MessageMeta)):
diff --git a/python/morpheus/morpheus/controllers/rss_controller.py b/python/morpheus/morpheus/controllers/rss_controller.py
index c8d47d6696..6334f4f23c 100644
--- a/python/morpheus/morpheus/controllers/rss_controller.py
+++ b/python/morpheus/morpheus/controllers/rss_controller.py
@@ -27,9 +27,10 @@
 import requests
 import requests_cache

-import cudf
-
 from morpheus.messages import MessageMeta
+from morpheus.utils.type_aliases import DataFrameModule
+from morpheus.utils.type_aliases import DataFrameType
+from morpheus.utils.type_utils import get_df_class

 logger = logging.getLogger(__name__)

@@ -105,7 +106,8 @@ def __init__(self,
                  strip_markup: bool = False,
                  stop_after: int = 0,
                  interval_secs: float = 600,
-                 should_stop_fn: Callable[[], bool] = None):
+                 should_stop_fn: Callable[[], bool] = None,
+                 df_type: DataFrameModule = "cudf"):

         if IMPORT_EXCEPTION is not None:
             raise ImportError(IMPORT_ERROR_MESSAGE) from IMPORT_EXCEPTION
@@ -141,6 +143,7 @@ def __init__(self,
         self._run_indefinitely = run_indefinitely
         self._interval_secs = interval_secs
         self._interval_td = timedelta(seconds=self._interval_secs)
+        self._df_class: type[DataFrameType] = get_df_class(df_type)

         self._enable_cache = enable_cache

@@ -349,7 +352,7 @@ def fetch_dataframes(self):

         Yields
         ------
-            cudf.DataFrame
+            DataFrameType
             A DataFrame containing feed entry data.
Raises @@ -374,14 +377,14 @@ def fetch_dataframes(self): entry_accumulator.append(entry) if self._batch_size > 0 and len(entry_accumulator) >= self._batch_size: - yield cudf.DataFrame(entry_accumulator) + yield self._df_class(entry_accumulator) entry_accumulator.clear() self._previous_entries = current_entries # Yield any remaining entries. if entry_accumulator: - yield cudf.DataFrame(entry_accumulator) + yield self._df_class(entry_accumulator) else: logger.debug("No new entries found.") diff --git a/python/morpheus/morpheus/io/deserializers.py b/python/morpheus/morpheus/io/deserializers.py index 31499b4359..34703ac50e 100644 --- a/python/morpheus/morpheus/io/deserializers.py +++ b/python/morpheus/morpheus/io/deserializers.py @@ -17,23 +17,38 @@ import io import typing -import pandas as pd - -import cudf +import numpy as np from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.common import read_file_to_df as read_file_to_df_cpp from morpheus.config import CppConfig from morpheus.io.utils import filter_null_data +from morpheus.io.utils import get_csv_reader +from morpheus.io.utils import get_json_reader +from morpheus.io.utils import get_parquet_reader +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType +def get_reader(file_type: FileTypes, df_type: DataFrameModule) -> typing.Callable[..., DataFrameType]: + if (file_type == FileTypes.CSV): + return get_csv_reader(df_type) + + if (file_type == FileTypes.JSON): + return get_json_reader(df_type) + + if (file_type == FileTypes.PARQUET): + return get_parquet_reader(df_type) + + raise ValueError(f"Unsupported file type: {file_type}") + + def _read_file_to_df_py(*, file_name: typing.Union[str, io.IOBase], file_type: FileTypes, parser_kwargs: dict, - df_type: typing.Literal["cudf", "pandas"]) -> DataFrameType: + df_type: DataFrameModule) -> DataFrameType: if (parser_kwargs is None): parser_kwargs = {} @@ -59,29 +74,15 @@ def _read_file_to_df_py(*, # Update with any args set by the user. User values overwrite defaults kwargs.update(parser_kwargs) + reader = get_reader(mode, df_type) - df_class = cudf if df_type == "cudf" else pd - - df = None - if (mode == FileTypes.JSON): - df = df_class.read_json(file_name, **kwargs) - - elif (mode == FileTypes.CSV): - df: pd.DataFrame = df_class.read_csv(file_name, **kwargs) - - if (len(df.columns) > 1 and df.columns[0] == "Unnamed: 0" and df.iloc[:, 0].dtype == cudf.dtype(int)): + df: DataFrameType = reader(file_name, **kwargs) + if (mode == FileTypes.CSV): + if (len(df.columns) > 1 and df.columns[0] == "Unnamed: 0" and df.iloc[:, 0].dtype == np.dtype(int)): df.set_index("Unnamed: 0", drop=True, inplace=True) df.index.name = "" df.sort_index(inplace=True) - elif (mode == FileTypes.PARQUET): - df = df_class.read_parquet(file_name, **kwargs) - - else: - assert False, f"Unsupported file type mode: {mode}" - - assert df is not None - return df @@ -90,7 +91,7 @@ def read_file_to_df(file_name: typing.Union[str, io.IOBase], parser_kwargs: dict = None, filter_nulls: bool = True, filter_null_columns: list[str] | str = 'data', - df_type: typing.Literal["cudf", "pandas"] = "pandas") -> DataFrameType: + df_type: DataFrameModule = "pandas") -> DataFrameType: """ Reads a file into a dataframe and performs any of the necessary cleanup. 
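With the reader dispatch above in place, `read_file_to_df` resolves a pandas or cudf reader from the `df_type` string instead of importing cudf unconditionally, which is what keeps this module importable in CPU-only mode. A minimal sketch, assuming a local "example.csv" exists and that the `file_type` keyword carries over from the existing signature:

    from morpheus.common import FileTypes
    from morpheus.io.deserializers import read_file_to_df

    # CPU-only path: returns a pandas DataFrame, cudf is never imported
    df = read_file_to_df("example.csv", file_type=FileTypes.CSV, df_type="pandas")

    # GPU path: identical call site, but a cudf.DataFrame is returned instead
    gpu_df = read_file_to_df("example.csv", file_type=FileTypes.CSV, df_type="cudf")
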
diff --git a/python/morpheus/morpheus/io/serializers.py b/python/morpheus/morpheus/io/serializers.py index 90822ab6f7..b82b82e99c 100644 --- a/python/morpheus/morpheus/io/serializers.py +++ b/python/morpheus/morpheus/io/serializers.py @@ -19,13 +19,12 @@ from io import IOBase from io import StringIO -import cudf - from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.common import write_df_to_file as write_df_to_file_cpp from morpheus.config import CppConfig from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type def df_to_stream_csv(df: DataFrameType, stream: IOBase, include_header=False, include_index_col=True): @@ -203,10 +202,11 @@ def write_df_to_file(df: DataFrameType, file_name: str, file_type: FileTypes = F Additional arguments forwarded to the underlying serialization function. Where the underlying serialization function is one of `write_df_to_file_cpp`, `df_to_stream_csv`, or `df_to_stream_json`. """ - if (CppConfig.get_should_use_cpp() and isinstance(df, cudf.DataFrame)): - # Use the C++ implementation - write_df_to_file_cpp(df=df, filename=file_name, file_type=file_type, **kwargs) - return + if (CppConfig.get_should_use_cpp()): + if (is_cudf_type(df)): + # Use the C++ implementation + write_df_to_file_cpp(df=df, filename=file_name, file_type=file_type, **kwargs) + return mode = file_type diff --git a/python/morpheus/morpheus/io/utils.py b/python/morpheus/morpheus/io/utils.py index 9a20afb4d5..58e5413f91 100644 --- a/python/morpheus/morpheus/io/utils.py +++ b/python/morpheus/morpheus/io/utils.py @@ -14,14 +14,21 @@ # limitations under the License. """IO utilities.""" +import functools import logging +import typing import pandas as pd -import cudf - +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import df_type_str_to_exec_mode +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + import cudf logger = logging.getLogger(__name__) @@ -44,7 +51,7 @@ def filter_null_data(x: DataFrameType, column_name: str = "data") -> DataFrameTy return x[~x[column_name].isna()] -def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict[str, int]) -> bool: +def cudf_string_cols_exceed_max_bytes(df: "cudf.DataFrame", column_max_bytes: dict[str, int]) -> bool: """ Checks a cudf DataFrame for string columns that exceed a maximum number of bytes and thus need to be truncated by calling `truncate_string_cols_by_bytes`. @@ -64,6 +71,7 @@ def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict bool True if truncation is needed, False otherwise. 
""" + import cudf if not isinstance(df, cudf.DataFrame): raise ValueError("Expected cudf DataFrame") @@ -101,7 +109,7 @@ def truncate_string_cols_by_bytes(df: DataFrameType, """ performed_truncation = False - is_cudf = isinstance(df, cudf.DataFrame) + is_cudf = is_cudf_type(df) for (col, max_bytes) in column_max_bytes.items(): series: SeriesType = df[col] @@ -124,8 +132,90 @@ def truncate_string_cols_by_bytes(df: DataFrameType, decoded_series = truncated_series.str.decode(encoding='utf-8', errors='ignore') if is_cudf: + import cudf df[col] = cudf.Series.from_pandas(decoded_series) else: df[col] = decoded_series return performed_truncation + + +def _selector_to_exec_mode(selector: DataFrameModule | ExecutionMode) -> ExecutionMode: + if not isinstance(selector, ExecutionMode): + execution_mode = df_type_str_to_exec_mode(selector) + else: + execution_mode = selector + + return execution_mode + + +def _get_df_method(selector: DataFrameModule | ExecutionMode, method_name: str) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate DataFrame method based on the execution mode. + """ + execution_mode = _selector_to_exec_mode(selector) + + if (execution_mode == ExecutionMode.GPU): + import cudf + method = getattr(cudf, method_name) + else: + method = getattr(pd, method_name) + + return method + + +@typing.overload +def get_csv_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... + + +@typing.overload +def get_csv_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_csv_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate CSV reader based on the execution mode. + """ + return _get_df_method(selector, 'read_csv') + + +@typing.overload +def get_json_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... + + +@typing.overload +def get_json_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_json_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate JSON reader based on the execution mode. + """ + execution_mode = _selector_to_exec_mode(selector) + reader = _get_df_method(execution_mode, 'read_json') + + if (execution_mode == ExecutionMode.GPU): + reader = functools.partial(reader, engine='cudf') + + return reader + + +@typing.overload +def get_parquet_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... + + +@typing.overload +def get_parquet_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_parquet_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate Parquet reader based on the execution mode. + """ + return _get_df_method(selector, 'read_parquet') diff --git a/python/morpheus/morpheus/messages/__init__.py b/python/morpheus/morpheus/messages/__init__.py index 867c41fefc..c6cb27c15c 100644 --- a/python/morpheus/morpheus/messages/__init__.py +++ b/python/morpheus/morpheus/messages/__init__.py @@ -18,7 +18,6 @@ # Import order is very important here. 
Import base classes before child ones # isort: off -from morpheus._lib.messages import ControlMessage from morpheus._lib.messages import DataLoaderRegistry from morpheus._lib.messages import RawPacketMessage from morpheus.messages.memory.tensor_memory import TensorMemory @@ -32,9 +31,12 @@ from morpheus.messages.message_base import MessageBase from morpheus.messages.message_meta import MessageMeta from morpheus.messages.message_meta import UserMessageMeta +from morpheus.messages.control_message import ControlMessageType +from morpheus.messages.control_message import ControlMessage __all__ = [ "ControlMessage", + "ControlMessageType", "DataLoaderRegistry", "InferenceMemory", "InferenceMemoryAE", diff --git a/python/morpheus/morpheus/messages/control_message.py b/python/morpheus/morpheus/messages/control_message.py new file mode 100644 index 0000000000..8c958572e8 --- /dev/null +++ b/python/morpheus/morpheus/messages/control_message.py @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pylint: disable=cyclic-import + +import dataclasses +import logging +import re +import typing +from collections import defaultdict +from collections import deque +from datetime import datetime + +# Users of this module should import ControlMessageType from morpheus.messages, we can't do that here without causing a +# circular import error, instead we import it from the _lib module, we don't want to put `_messages.ControlMessageType` +# in the public API and confuse users +import morpheus._lib.messages as _messages +from morpheus._lib.messages import ControlMessageType # pylint: disable=morpheus-incorrect-lib-from-import +from morpheus.cli.utils import get_enum_keys +from morpheus.cli.utils import get_enum_members +from morpheus.messages.memory.tensor_memory import TensorMemory +from morpheus.messages.message_base import MessageBase +from morpheus.messages.message_meta import MessageMeta + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(init=False) +class ControlMessage(MessageBase, cpp_class=_messages.ControlMessage): + + def __init__(self, config_or_message: typing.Union["ControlMessage", dict] = None): + super().__init__() + + self._config: dict = {"metadata": {}} + + self._payload: MessageMeta = None + self._tensors: TensorMemory = None + + self._tasks: dict[str, deque] = defaultdict(deque) + self._timestamps: dict[str, datetime] = {} + self._type: ControlMessageType = ControlMessageType.NONE + + if isinstance(config_or_message, dict): + self.config(config_or_message) + elif isinstance(config_or_message, ControlMessage): + self._copy_impl(config_or_message, self) + elif config_or_message is not None: + raise ValueError(f"Invalid argument type {type(config_or_message)}, value must be a dict or ControlMessage") + + def copy(self) -> "ControlMessage": + return self._copy_impl(self) + + def config(self, config: dict = None) -> dict: + if config is not None: + cm_type: str | ControlMessageType = config.get("type") + if cm_type is not 
None: + if isinstance(cm_type, str): + try: + cm_type = get_enum_members(ControlMessageType)[cm_type] + except KeyError as exc: + enum_names = ", ".join(get_enum_keys(ControlMessageType)) + raise ValueError( + f"Invalid ControlMessageType: {cm_type}, supported types: {enum_names}") from exc + + self._type = cm_type + + tasks = config.get("tasks") + if tasks is not None: + for task in tasks: + self.add_task(task["type"], task["properties"]) + + self._config = {"metadata": config.get("metadata", {}).copy()} + + return self._config + + def has_task(self, task_type: str) -> bool: + """ + Return True if the control message has at least one task of the given type + """ + # Using `get` to avoid creating an empty list if the task type is not present + tasks = self._tasks.get(task_type, []) + return len(tasks) > 0 + + def add_task(self, task_type: str, task: dict): + if isinstance(task_type, str): + cm_type = get_enum_members(ControlMessageType).get(task_type, ControlMessageType.NONE) + if cm_type != ControlMessageType.NONE: + if self._type == ControlMessageType.NONE: + self._type = cm_type + elif self._type != cm_type: + raise ValueError("Cannot mix different types of tasks on the same control message") + + self._tasks[task_type].append(task) + + def remove_task(self, task_type: str) -> dict: + tasks = self._tasks.get(task_type, []) + if len(tasks) == 0: + raise ValueError(f"No task of type {task_type} found") + + return tasks.popleft() + + def get_tasks(self) -> dict[str, deque]: + return self._tasks + + def set_metadata(self, key: str, value: typing.Any): + self._config["metadata"][key] = value + + def has_metadata(self, key: str) -> bool: + return key in self._config["metadata"] + + def get_metadata(self, key: str = None, default_value: typing.Any = None) -> typing.Any: + """ + Return a given piece of metadata, if `key` is `None` return the entire metadata dictionary. + If `key` is not found, `default_value` is returned. 
+ + :param key: The key of the metadata to retrieve, or None for all metadata + :param default_value: The value to return if the key is not found, ignored if `key` is None + :return: The value of the metadata key, or the entire metadata dictionary if `key` is None + """ + + # Not using `get` since `None` is a valid value + if key is None: + return self._config["metadata"] + + return self._config["metadata"].get(key, default_value) + + def list_metadata(self) -> list[str]: + return sorted(self._config["metadata"].keys()) + + def payload(self, payload: MessageMeta = None) -> MessageMeta | None: + if payload is not None: + self._payload = payload + + return self._payload + + def tensors(self, tensors: TensorMemory = None) -> TensorMemory | None: + if tensors is not None: + self._tensors = tensors + + return self._tensors + + def task_type(self, new_task_type: ControlMessageType = None) -> ControlMessageType: + if new_task_type is not None: + self._type = new_task_type + + return self._type + + def set_timestamp(self, key: str, timestamp: datetime): + self._timestamps[key] = timestamp + + def get_timestamp(self, key: str, fail_if_nonexist: bool = False) -> datetime | None: + try: + return self._timestamps[key] + except KeyError as e: + if fail_if_nonexist: + raise ValueError("Timestamp for the specified key does not exist.") from e + return None + + def filter_timestamp(self, regex_filter: str) -> dict[str, datetime]: + re_obj = re.compile(regex_filter) + + return {key: value for key, value in self._timestamps.items() if re_obj.match(key)} + + def _export_config(self) -> dict: + # Unfortunately there is no parity between the `config` object that the constructor accepts and the value + # returned by the `config` method. This method returns a config object that can be used to create a new instance + # with the same task type and tasks. + config = self.config().copy() + config["type"] = self.task_type().name + + tasks = [] + for (task_type, task_queue) in self.get_tasks().items(): + for task in task_queue: + tasks.append({"type": task_type, "properties": task}) + + config["tasks"] = tasks + + return config + + @classmethod + def _copy_impl(cls, src: "ControlMessage", dst: "ControlMessage" = None) -> "ControlMessage": + config = src._export_config() + + if dst is None: + dst = cls() + + dst.config(config) + dst.payload(src.payload()) + dst.tensors(src.tensors()) + dst._timestamps = src._timestamps.copy() + + return dst diff --git a/python/morpheus/morpheus/messages/memory/inference_memory.py b/python/morpheus/morpheus/messages/memory/inference_memory.py index 9bdc7b6503..6913515fe8 100644 --- a/python/morpheus/morpheus/messages/memory/inference_memory.py +++ b/python/morpheus/morpheus/messages/memory/inference_memory.py @@ -15,18 +15,17 @@ import dataclasses -import cupy as cp - import morpheus._lib.messages as _messages from morpheus.messages.data_class_prop import DataClassProp from morpheus.messages.memory.tensor_memory import TensorMemory +from morpheus.utils.type_aliases import NDArrayType @dataclasses.dataclass(init=False) class InferenceMemory(TensorMemory, cpp_class=_messages.InferenceMemory): """ This is a base container class for data that will be used for inference stages. This class is designed to - hold generic tensor data in cupy arrays. + hold generic tensor data in either CuPy or NumPy arrays. """ def get_input(self, name: str): @@ -40,7 +39,7 @@ def get_input(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Inputs corresponding to name. 
Raises @@ -50,7 +49,7 @@ def get_input(self, name: str): """ return self.get_tensor(name) - def set_input(self, name: str, tensor: cp.ndarray): + def set_input(self, name: str, tensor: NDArrayType): """ Update the input tensor identified by `name`. Alias for `InferenceMemory.set_tensor` @@ -58,8 +57,8 @@ def set_input(self, name: str, tensor: cp.ndarray): ---------- name : str Key used to do lookup in inputs dict of the container. - tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either CuPy or NumPy array. """ self.set_tensor(name, tensor) @@ -72,23 +71,23 @@ class InferenceMemoryNLP(InferenceMemory, cpp_class=_messages.InferenceMemoryNLP Parameters ---------- - input_ids : cupy.ndarray + input_ids : NDArrayType The token-ids for each string padded with 0s to max_length. - input_mask : cupy.ndarray + input_mask : NDArrayType The mask for token-ids result where corresponding positions identify valid token-id values. - seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). """ - input_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) - input_mask: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, + input_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) + input_mask: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) - def __init__(self, *, count: int, input_ids: cp.ndarray, input_mask: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, input_ids: NDArrayType, input_mask: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input_ids': input_ids, 'input_mask': input_mask, 'seq_ids': seq_ids}) @@ -100,19 +99,19 @@ class InferenceMemoryFIL(InferenceMemory, cpp_class=_messages.InferenceMemoryFIL Parameters ---------- - input__0 : cupy.ndarray + input__0 : NDArrayType Inference input. - seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). """ - input__0: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, + input__0: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) - def __init__(self, *, count: int, input__0: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, input__0: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input__0': input__0, 'seq_ids': seq_ids}) @@ -123,16 +122,16 @@ class InferenceMemoryAE(InferenceMemory, cpp_class=None): Parameters ---------- - inputs : cupy.ndarray + inputs : NDArrayType Inference input. 
- seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). """ - input: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) + input: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) - def __init__(self, *, count: int, inputs: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, inputs: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input': inputs, 'seq_ids': seq_ids}) diff --git a/python/morpheus/morpheus/messages/memory/response_memory.py b/python/morpheus/morpheus/messages/memory/response_memory.py index eb4318f928..bcf6a4c61b 100644 --- a/python/morpheus/morpheus/messages/memory/response_memory.py +++ b/python/morpheus/morpheus/messages/memory/response_memory.py @@ -16,12 +16,13 @@ import dataclasses import logging -import cupy as cp +import pandas as pd import morpheus._lib.messages as _messages from morpheus.messages.data_class_prop import DataClassProp from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.utils import logger as morpheus_logger +from morpheus.utils.type_aliases import NDArrayType logger = logging.getLogger(__name__) @@ -45,7 +46,7 @@ def get_output(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensors corresponding to name. Raises @@ -56,7 +57,7 @@ def get_output(self, name: str): """ return self.get_tensor(name) - def set_output(self, name: str, tensor: cp.ndarray): + def set_output(self, name: str, tensor: NDArrayType): """ Update the output tensor identified by `name`. Alias for `ResponseMemory.set_tensor` @@ -64,8 +65,8 @@ def set_output(self, name: str, tensor: cp.ndarray): ---------- name : str Key used to do lookup in tensors dict of the container. - tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either a CuPy or NumPy array. 
Raises ------ @@ -82,12 +83,12 @@ class ResponseMemoryProbs(ResponseMemory, cpp_class=_messages.ResponseMemoryProb Parameters ---------- - probs : cupy.ndarray + probs : NDArrayType Probabilities tensor """ - probs: dataclasses.InitVar[cp.ndarray] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) + probs: dataclasses.InitVar[NDArrayType] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) - def __init__(self, *, count: int, probs: cp.ndarray): + def __init__(self, *, count: int, probs: NDArrayType): super().__init__(count=count, tensors={'probs': probs}) @@ -98,7 +99,7 @@ class ResponseMemoryAE(ResponseMemory, cpp_class=None): Parameters ---------- - probs : cupy.ndarray + probs : NDArrayType Probabilities tensor user_id : str @@ -108,9 +109,9 @@ class ResponseMemoryAE(ResponseMemory, cpp_class=None): Explainability Dataframe, for each feature a column will exist with a name in the form of: `{feature}_z_loss` containing the loss z-score along with `max_abs_z` and `mean_abs_z` columns """ - probs: dataclasses.InitVar[cp.ndarray] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) - user_id = "" - explain_df = None + probs: dataclasses.InitVar[NDArrayType] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) + user_id: str = "" + explain_df: pd.DataFrame = None - def __init__(self, *, count: int, probs: cp.ndarray): + def __init__(self, *, count: int, probs: NDArrayType): super().__init__(count=count, tensors={'probs': probs}) diff --git a/python/morpheus/morpheus/messages/memory/tensor_memory.py b/python/morpheus/morpheus/messages/memory/tensor_memory.py index 103240b15f..2e3164585e 100644 --- a/python/morpheus/morpheus/messages/memory/tensor_memory.py +++ b/python/morpheus/morpheus/messages/memory/tensor_memory.py @@ -16,30 +16,30 @@ import dataclasses import typing -import cupy as cp - import morpheus._lib.messages as _messages from morpheus.messages.message_base import MessageData +from morpheus.utils.type_aliases import NDArrayType +from morpheus.utils.type_aliases import TensorMapType @dataclasses.dataclass(init=False) class TensorMemory(MessageData, cpp_class=_messages.TensorMemory): """ This is a base container class for data that will be used for inference stages. This class is designed to - hold generic tensor data in cupy arrays. + hold generic tensor data in either CuPy or NumPy arrays. Parameters ---------- count : int Length of each tensor contained in `tensors`. - tensors : typing.Dict[str, cupy.ndarray] + tensors : TensorMapType Collection of tensors uniquely identified by a name. 
""" count: int - tensors: typing.Dict[str, cp.ndarray] = dataclasses.field(repr=False) + tensors: TensorMapType = dataclasses.field(repr=False) - def __init__(self, *, count: int = None, tensors: typing.Dict[str, cp.ndarray] = None): + def __init__(self, *, count: int = None, tensors: TensorMapType = None): self.count = count @@ -50,11 +50,11 @@ def __init__(self, *, count: int = None, tensors: typing.Dict[str, cp.ndarray] = self._tensors = tensors - def _check_tensors(self, tensors: typing.Dict[str, cp.ndarray]): + def _check_tensors(self, tensors: TensorMapType): for tensor in tensors.values(): self._check_tensor(tensor) - def _check_tensor(self, tensor: cp.ndarray): + def _check_tensor(self, tensor: NDArrayType): if (tensor.shape[0] != self.count): class_name = type(self).__name__ raise ValueError( @@ -96,18 +96,18 @@ def get_tensors(self): Returns ------- - typing.Dict[str, cp.ndarray] + TensorMapType """ return self._tensors - def set_tensors(self, tensors: typing.Dict[str, cp.ndarray]): + def set_tensors(self, tensors: TensorMapType): """ Overwrite the tensors stored by this instance. If the length of the tensors has changed, then the `count` property should also be updated. Parameters ---------- - tensors : typing.Dict[str, cupy.ndarray] + tensors : TensorMapType Collection of tensors uniquely identified by a name. """ self._check_tensors(tensors) @@ -124,7 +124,7 @@ def get_tensor(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensor. Raises @@ -145,7 +145,7 @@ def _get_tensor_prop(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensor. Raises @@ -158,7 +158,7 @@ def _get_tensor_prop(self, name: str): except KeyError as e: raise AttributeError from e - def set_tensor(self, name: str, tensor: cp.ndarray): + def set_tensor(self, name: str, tensor: NDArrayType): """ Update the tensor identified by `name`. @@ -166,15 +166,13 @@ def set_tensor(self, name: str, tensor: cp.ndarray): ---------- name : str Tensor key name. - tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either a CuPy or NumPy array. 
Raises ------ ValueError If the number of rows in `tensor` does not match `count` """ - # Ensure that we have 2D array here (`ensure_2d` inserts the wrong axis) - reshaped_tensor = tensor if tensor.ndim == 2 else cp.reshape(tensor, (tensor.shape[0], -1)) - self._check_tensor(reshaped_tensor) - self._tensors[name] = reshaped_tensor + self._check_tensor(tensor) + self._tensors[name] = tensor diff --git a/python/morpheus/morpheus/messages/message_meta.py b/python/morpheus/morpheus/messages/message_meta.py index ecf542b553..4a3507fdf6 100644 --- a/python/morpheus/morpheus/messages/message_meta.py +++ b/python/morpheus/morpheus/messages/message_meta.py @@ -18,15 +18,14 @@ import typing import warnings -import cupy as cp import numpy as np import pandas as pd -import cudf - import morpheus._lib.messages as _messages from morpheus.messages.message_base import MessageBase +from morpheus.utils import logger as morpheus_logger from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType logger = logging.getLogger(__name__) @@ -49,7 +48,7 @@ class MutableTableCtxMgr: def __init__(self, meta) -> None: self.__dict__['__meta'] = meta - def __enter__(self) -> pd.DataFrame: + def __enter__(self) -> DataFrameType: meta = self.__dict__['__meta'] meta._mutex.acquire() return meta._df @@ -206,7 +205,7 @@ def get_meta_range(self, idx = self._df.index[mess_offset:mess_offset + message_count] - if (isinstance(idx, cudf.RangeIndex)): + if (isinstance(idx, pd.RangeIndex)): idx = slice(idx.start, idx.stop - 1, idx.step) if (columns is None): @@ -216,15 +215,15 @@ def get_meta_range(self, return self._df.loc[idx, columns] @typing.overload - def get_data(self) -> cudf.DataFrame: + def get_data(self) -> DataFrameType: ... @typing.overload - def get_data(self, columns: str) -> cudf.Series: + def get_data(self, columns: str) -> SeriesType: ... @typing.overload - def get_data(self, columns: typing.List[str]) -> cudf.DataFrame: + def get_data(self, columns: typing.List[str]) -> DataFrameType: ... def get_data(self, columns: typing.Union[None, str, typing.List[str]] = None): @@ -277,10 +276,6 @@ def set_data(self, columns: typing.Union[None, str, typing.List[str]], value): # First try to set the values on just our slice if the columns exist column_indexer = self._get_col_indexers(df, columns=columns) - # Check if the value is a cupy array and we have a pandas dataframe, convert to numpy - if (isinstance(value, cp.ndarray) and isinstance(df, pd.DataFrame)): - value = value.get() - # Check to see if we are adding a column. If so, we need to use df.loc instead of df.iloc if (-1 not in column_indexer): @@ -299,35 +294,8 @@ def set_data(self, columns: typing.Union[None, str, typing.List[str]], value): # Columns should never be empty if we get here assert columns is not None - # cudf is really bad at adding new columns - if (isinstance(df, cudf.DataFrame)): - - # TODO(morpheus#1487): This logic no longer works in CUDF 24.04. - # We should find a way to reinable the no-dropped-index path as - # that should be more performant than dropping the index. 
- # # saved_index = None - - # # # Check to see if we can use slices - # # if (not (df.index.is_unique and - # # (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing))): - # # # Save the index and reset - # # saved_index = df.index - # # df.reset_index(drop=True, inplace=True) - - # # # Perform the update via slices - # # df.loc[df.index[row_indexer], columns] = value - - # # # Reset the index if we changed it - # # if (saved_index is not None): - # # df.set_index(saved_index, inplace=True) - - saved_index = df.index - df.reset_index(drop=True, inplace=True) - df.loc[df.index[:], columns] = value - df.set_index(saved_index, inplace=True) - else: - # Now set the slice - df.loc[:, columns] = value + # Now set the slice + df.loc[:, columns] = value def get_slice(self, start, stop): """ @@ -350,12 +318,7 @@ def get_slice(self, start, stop): return MessageMeta(df.iloc[start:stop]) def _ranges_to_mask(self, df, ranges): - if isinstance(df, cudf.DataFrame): - zeros_fn = cp.zeros - else: - zeros_fn = np.zeros - - mask = zeros_fn(len(df), bool) + mask = np.zeros(len(df), bool) for range_ in ranges: mask[range_[0]:range_[1]] = True @@ -399,6 +362,8 @@ class UserMessageMeta(MessageMeta, cpp_class=None): user_id: str = dataclasses.field(init=False) def __init__(self, df: pd.DataFrame, user_id: str) -> None: + from morpheus.messages.control_message import ControlMessage + morpheus_logger.deprecated_message_warning(UserMessageMeta, ControlMessage) super().__init__(df) self.user_id = user_id @@ -418,5 +383,7 @@ class AppShieldMessageMeta(MessageMeta, cpp_class=None): source: str = dataclasses.field(init=False) def __init__(self, df: pd.DataFrame, source: str) -> None: + from morpheus.messages.control_message import ControlMessage + morpheus_logger.deprecated_message_warning(AppShieldMessageMeta, ControlMessage) super().__init__(df) self.source = source diff --git a/python/morpheus/morpheus/modules/filter_detections.py b/python/morpheus/morpheus/modules/filter_detections.py index 94ef301862..c0793a6092 100644 --- a/python/morpheus/morpheus/modules/filter_detections.py +++ b/python/morpheus/morpheus/modules/filter_detections.py @@ -81,7 +81,7 @@ def filter_detections(builder: mrc.Builder): field_name = config.get("field_name", "probs") threshold = config.get("threshold", 0.5) filter_source = config.get("filter_source", "AUTO") - use_cpp = config.get("use_cpp", False) + use_cpp = config.get("use_cpp", True) filter_source_dict = {"AUTO": FilterSource.Auto, "DATAFRAME": FilterSource.DATAFRAME, "TENSOR": FilterSource.TENSOR} diff --git a/python/morpheus/morpheus/modules/payload_batcher.py b/python/morpheus/morpheus/modules/payload_batcher.py index ca62a252bd..d3372e40e3 100644 --- a/python/morpheus/morpheus/modules/payload_batcher.py +++ b/python/morpheus/morpheus/modules/payload_batcher.py @@ -13,14 +13,11 @@ # limitations under the License. 
import logging -import typing import warnings import mrc from mrc.core import operators as ops -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.control_message_utils import cm_default_failure_context_manager @@ -28,6 +25,9 @@ from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import PAYLOAD_BATCHER from morpheus.utils.module_utils import register_module +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type logger = logging.getLogger(__name__) @@ -103,7 +103,7 @@ def payload_batcher(builder: mrc.Builder): @cm_skip_processing_if_failed @cm_default_failure_context_manager(raise_on_failure=raise_on_failure) - def on_next(control_message: ControlMessage) -> typing.List[ControlMessage]: + def on_next(control_message: ControlMessage) -> list[ControlMessage]: nonlocal disable_max_batch_size message_meta = control_message.payload() @@ -119,7 +119,7 @@ def on_next(control_message: ControlMessage) -> typing.List[ControlMessage]: return control_messages - def _batch_dataframe(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: + def _batch_dataframe(df: DataFrameType) -> list[DataFrameType]: nonlocal max_batch_size dfm_length = len(df) @@ -131,7 +131,7 @@ def _batch_dataframe(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: dfs = [df.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches)] return dfs - def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: + def _batch_dataframe_by_group(df: DataFrameType) -> list[DataFrameType]: nonlocal max_batch_size nonlocal group_by_columns nonlocal timestamp_column_name @@ -143,9 +143,14 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] if has_timestamp_column: # Apply timestamp pattern and group by the formatted timestamp column - df[period_column] = cudf.to_datetime(df[timestamp_column_name], format=timestamp_pattern) - # Period object conversion is not supported in cudf - df[period_column] = df[period_column].to_pandas().dt.to_period(period).astype('str') + df_pkg = get_df_pkg_from_obj(df) + period_series = df_pkg.to_datetime(df[timestamp_column_name], format=timestamp_pattern) + + if is_cudf_type(df): + # Period object conversion is not supported in cudf + period_series = period_series.to_pandas() + + df[period_column] = period_series.dt.to_period(period).astype('str') if len(group_by_columns) == 1: # Avoid warning from cudf regardning an upcoming change of behavior when applying a groupby to a single diff --git a/python/morpheus/morpheus/parsers/event_parser.py b/python/morpheus/morpheus/parsers/event_parser.py index a82785b48a..e2e23e5836 100644 --- a/python/morpheus/morpheus/parsers/event_parser.py +++ b/python/morpheus/morpheus/parsers/event_parser.py @@ -14,13 +14,14 @@ """Abstract class for all event log parsers.""" import logging -import typing from abc import ABC from abc import abstractmethod import yaml -import cudf +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj log = logging.getLogger(__name__) @@ -31,13 +32,13 @@ class EventParser(ABC): Parameters ---------- - columns: typing.Set[str] + columns: set[str] Event column names event_name: str Event name """ - def __init__(self, columns: typing.Set[str], event_name: str): + 
def __init__(self, columns: set[str], event_name: str): self._columns = columns self._event_name = event_name @@ -48,7 +49,7 @@ def columns(self): Returns ------- - typing.Set[str] + set[str] Event column names """ return self._columns @@ -66,7 +67,7 @@ def event_name(self): return self._event_name @abstractmethod - def parse(self, text: cudf.Series) -> cudf.Series: + def parse(self, text: SeriesType) -> SeriesType: """ Abstract method 'parse' triggers the parsing functionality. Subclasses are required to implement and execute any parsing pre-processing steps. @@ -74,25 +75,26 @@ def parse(self, text: cudf.Series) -> cudf.Series: log.info("Begin parsing of dataframe") pass - def parse_raw_event(self, text: cudf.Series, event_regex: typing.Dict[str, any]) -> cudf.DataFrame: + def parse_raw_event(self, text: SeriesType, event_regex: dict[str, str]) -> DataFrameType: """ Processes parsing of a specific type of raw event records received as a dataframe. Parameters ---------- - text : cudf.Series + text : SeriesType Raw event log text to be parsed. - event_regex: typing.Dict[str, any] + event_regex: typing.Dict[str, str] Required regular expressions for a given event type. Returns ------- - cudf.DataFrame + DataFrameType Parsed logs dataframe """ log.debug("Parsing raw events. Event type: %s", self.event_name) - parsed_gdf = cudf.DataFrame({col: [""] for col in self.columns}) + df_pkg = get_df_pkg_from_obj(text) + parsed_gdf = df_pkg.DataFrame({col: [""] for col in self.columns}) parsed_gdf = parsed_gdf[:0] event_specific_columns = event_regex.keys() # Applies regex pattern for each expected output column to raw data @@ -109,7 +111,7 @@ def parse_raw_event(self, text: cudf.Series, event_regex: typing.Dict[str, any]) return parsed_gdf - def _load_regex_yaml(self, yaml_file) -> typing.Dict[str, any]: + def _load_regex_yaml(self, yaml_file) -> dict[str, str]: """Returns a dictionary of event regexes contained in the given yaml file.""" with open(yaml_file, encoding='UTF-8') as yaml_file_h: regex_dict = yaml.safe_load(yaml_file_h) diff --git a/python/morpheus/morpheus/parsers/ip.py b/python/morpheus/morpheus/parsers/ip.py index 1fcb75ee81..a177f49082 100644 --- a/python/morpheus/morpheus/parsers/ip.py +++ b/python/morpheus/morpheus/parsers/ip.py @@ -12,24 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. +import ipaddress + import numpy as np +import pandas as pd -import cudf +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type -def ip_to_int(values): +def ip_to_int(values: SeriesType) -> SeriesType: """ Convert string column of IP addresses to integer values. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - values : cudf.Series + values : SeriesType IPv4 addresses to be converted Returns ------- - rtype : cudf.Series + rtype : SeriesType Integer representations of IP addresses Examples @@ -41,22 +46,26 @@ def ip_to_int(values): 1 167772161 dtype: uint32 """ - return values.str.ip2int() + if (is_cudf_type(values)): + return values.str.ip2int() + + # Pandas does not have an ip2int method + return values.apply(lambda x: int(ipaddress.IPv4Address(x))) -def int_to_ip(values): +def int_to_ip(values: SeriesType) -> SeriesType: """ Convert integer column to IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - values : cudf.Series + values : SeriesType uint32 representations of IP addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType IPv4 addresses Examples @@ -68,22 +77,27 @@ def int_to_ip(values): 1 10.0.0.1 dtype: object """ - return cudf.Series._from_column(values._column.int2ip()) + if (is_cudf_type(values)): + import cudf + return cudf.Series._from_column(values._column.int2ip()) + + # Pandas does not have an int2ip method + return values.apply(lambda x: str(ipaddress.IPv4Address(x))) -def is_ip(ips: str): +def is_ip(ips: SeriesType) -> SeriesType: """ Indicates whether each address is an ip string. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -95,23 +109,26 @@ def is_ip(ips: str): 1 False dtype: bool """ + if (is_cudf_type(ips)): + return ips.str.isipv4() + is_ip_regex = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" - return ips.str.match(is_ip_regex) + return ips.str.fullmatch(is_ip_regex) -def is_reserved(ips): +def is_reserved(ips: SeriesType) -> SeriesType: """ Indicates whether each address is reserved. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -129,19 +146,19 @@ def is_reserved(ips): return ips.str.match(reserved_ipv4_regex) -def is_loopback(ips): +def is_loopback(ips: SeriesType) -> SeriesType: """ Indicates whether each address is loopback. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -159,19 +176,19 @@ def is_loopback(ips): return ips.str.match(loopback_ipv4_regex) -def is_link_local(ips): +def is_link_local(ips: SeriesType) -> SeriesType: """ Indicates whether each address is link local. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -189,19 +206,19 @@ def is_link_local(ips): return ips.str.match(link_local_ipv4_regex) -def is_unspecified(ips): +def is_unspecified(ips: SeriesType) -> SeriesType: """ Indicates whether each address is unspecified. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -217,19 +234,19 @@ def is_unspecified(ips): return ips.str.match(unspecified_regex) -def is_multicast(ips): +def is_multicast(ips: SeriesType) -> SeriesType: """ Indicates whether each address is multicast. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -247,19 +264,19 @@ def is_multicast(ips): return ips.str.match(is_multicast_ipv4_regex) -def is_private(ips): +def is_private(ips: SeriesType) -> SeriesType: """ Indicates whether each address is private. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -290,19 +307,19 @@ def is_private(ips): return ips.str.match(private_regex) -def is_global(ips): +def is_global(ips: SeriesType) -> SeriesType: """ Indicates whether each address is global. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -323,7 +340,7 @@ def is_global(ips): return result -def _netmask_kernel(idx, out1, out2, out3, out4, kwarg1): +def _mask_kernel(idx, out1, out2, out3, out4, kwarg1): for i, _ in enumerate(idx): out1[i] = int(kwarg1 / 16777216) % 256 out2[i] = int(kwarg1 / 65536) % 256 @@ -331,21 +348,52 @@ def _netmask_kernel(idx, out1, out2, out3, out4, kwarg1): out4[i] = int(kwarg1) % 256 -def netmask(ips, prefixlen=16): +def _mask_pandas(df_cols: tuple[int], mask_: int, series_name: str) -> pd.Series: + outputs = [int(mask_ / 16777216) % 256, int(mask_ / 65536) % 256, int(mask_ / 256) % 256, int(mask_) % 256] + return pd.Series([df_cols.idx, ".".join(map(str, outputs))], index=["idx", series_name]) + + +def _compute_mask_impl(ips: SeriesType, mask_: int, series_name: str) -> SeriesType: + df_pkg = get_df_pkg_from_obj(ips) + if is_cudf_type(ips): + df = df_pkg.DataFrame() + df["idx"] = ips.index + x = df.apply_rows( + _mask_kernel, + incols=["idx"], + outcols={ + "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 + }, + kwargs={"kwarg1": mask_}, + ) + + out1 = x["out1"].astype(str) + out2 = x["out2"].astype(str) + out3 = x["out3"].astype(str) + out4 = x["out4"].astype(str) + df[series_name] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + else: + df = df_pkg.DataFrame({"idx": ips.index}) + df = df.apply(_mask_pandas, axis=1, args=(mask_, series_name)) + + return df[series_name] + + +def netmask(ips: SeriesType, prefixlen: int = 16) -> SeriesType: """ Compute a column of netmasks for a column of IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked prefixlen: int Length of the network prefix, in bits, for IPv4 addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType Netmask ouput from set of IP address @@ -360,48 +408,24 @@ def netmask(ips, prefixlen=16): """ all_ones = (2**32) - 1 mask_int = all_ones ^ (all_ones >> prefixlen) - df = cudf.DataFrame() - df["idx"] = ips.index - x = df.apply_rows( - _netmask_kernel, - incols=["idx"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": mask_int}, - ) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["net_mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") - return df["net_mask"] - - -def _hostmask_kernel(idx, out1, out2, out3, out4, kwarg1): - for i, _ in enumerate(idx): - out1[i] = int(kwarg1 / 16777216) % 256 - out2[i] = int(kwarg1 / 65536) % 256 - out3[i] = int(kwarg1 / 256) % 256 - out4[i] = int(kwarg1) % 256 + return _compute_mask_impl(ips, mask_int, "net_mask") -def hostmask(ips, prefixlen=16): +def hostmask(ips: SeriesType, prefixlen: int = 16) -> SeriesType: """ Compute a column of hostmasks for a column of IP addresses. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked prefixlen: integer Length of the network prefix, in bits, for IPv4 addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType Hostmask ouput from set of IP address Examples @@ -415,24 +439,10 @@ def hostmask(ips, prefixlen=16): """ all_ones = (2**32) - 1 host_mask_int = int(all_ones ^ (all_ones >> prefixlen)) ^ all_ones - df = cudf.DataFrame() - df["idx"] = ips.index - x = df.apply_rows(_hostmask_kernel, - incols=["idx"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": host_mask_int}) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["hostmask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") - return df["hostmask"] - - -def _mask_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disable=unused-argument + return _compute_mask_impl(ips, host_mask_int, "hostmask") + + +def _mask_series_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disable=unused-argument for i, ipnum in enumerate(masked_ip_int): out1[i] = int(ipnum / 16777216) % 256 out2[i] = int(ipnum / 65536) % 256 @@ -440,21 +450,25 @@ def _mask_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disa out4[i] = int(ipnum) % 256 -def mask(ips, masks): +def _mask_series_pandas(df_cols: tuple[int], mask_series_name: str, output_series_name: str) -> pd.Series: + return _mask_pandas(df_cols, df_cols[mask_series_name], output_series_name) + + +def mask(ips: SeriesType, masks: SeriesType) -> SeriesType: """ Apply a mask to a column of IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked - masks: cudf.Series + masks: SeriesType The host or subnet masks to be applied Returns ------- - rtype : cudf.Series + rtype : SeriesType Masked IP address from list of IPs Examples @@ -468,21 +482,28 @@ def mask(ips, masks): 1 10.0.0.0 Name: mask, dtype: object """ - df = cudf.DataFrame() - df["int_mask"] = masks.str.ip2int() - df["int_ip"] = ips.str.ip2int() + df_pkg = get_df_pkg_from_obj(ips) + + df = df_pkg.DataFrame() + df["int_mask"] = ip_to_int(masks) + df["int_ip"] = ip_to_int(ips) df["masked_ip_int"] = df["int_mask"] & df["int_ip"] - x = df.apply_rows(_mask_kernel, - incols=["masked_ip_int"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": 0}) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + if (is_cudf_type(df)): + x = df.apply_rows(_mask_series_kernel, + incols=["masked_ip_int"], + outcols={ + "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 + }, + kwargs={"kwarg1": 0}) + + out1 = x["out1"].astype(str) + out2 = x["out2"].astype(str) + out3 = x["out3"].astype(str) + out4 = x["out4"].astype(str) + df["mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + else: + df["idx"] = ips.index + df = df.apply(_mask_series_pandas, axis=1, args=("masked_ip_int", "mask")) + return df["mask"] diff --git a/python/morpheus/morpheus/parsers/url_parser.py b/python/morpheus/morpheus/parsers/url_parser.py index bf3077a601..88a8b56ddf 100644 --- a/python/morpheus/morpheus/parsers/url_parser.py +++ b/python/morpheus/morpheus/parsers/url_parser.py @@ -13,21 +13,15 @@ # limitations under the License. import os - -import cudf +import types import morpheus +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type - -def _load_suffix_file(): - suffix_list_path = os.path.join(morpheus.DATA_DIR, "public_suffix_list.dat") - # Read suffix list csv file - suffix_df = cudf.io.csv.read_csv(suffix_list_path, names=["suffix"], header=None, dtype=["str"]) - suffix_df = suffix_df[suffix_df["suffix"].str.contains("^[^//]+$")] - return suffix_df - - -_SUFFIX_DF = _load_suffix_file() +_SUFFIX_DF_CACHE = {} _ALLOWED_OUTPUT_COLS = { "hostname", "subdomain", @@ -36,7 +30,24 @@ def _load_suffix_file(): } -def _handle_unknown_suffix(unknown_suffix_df, col_dict): +def _get_suffix_df(df_pkg: types.ModuleType) -> DataFrameType: + suffix_df = _SUFFIX_DF_CACHE.get(df_pkg) + if suffix_df is None: + suffix_list_path = os.path.join(morpheus.DATA_DIR, "public_suffix_list.dat") + # Read suffix list csv file, ignore comments and empty lines. 
+ suffix_df = df_pkg.read_csv(suffix_list_path, + names=["suffix"], + header=None, + dtype={'suffix': "str"}, + comment='/', + skip_blank_lines=True) + suffix_df = suffix_df[suffix_df["suffix"].str.contains("^[^//]+$")] + _SUFFIX_DF_CACHE[df_pkg] = suffix_df + + return suffix_df + + +def _handle_unknown_suffix(unknown_suffix_df: DataFrameType, col_dict: dict[str, bool]) -> DataFrameType: if col_dict["hostname"]: unknown_suffix_df = unknown_suffix_df[["idx", "tld0"]] unknown_suffix_df = unknown_suffix_df.rename(columns={"tld0": "hostname"}) @@ -53,7 +64,8 @@ def _handle_unknown_suffix(unknown_suffix_df, col_dict): return unknown_suffix_df -def _extract_tld(input_df, suffix_df, col_len, col_dict): +def _extract_tld(input_df: DataFrameType, suffix_df: DataFrameType, col_len: int, + col_dict: dict[str, bool]) -> DataFrameType: tmp_dfs = [] # Left join on single column dataframe does not provide expected results hence adding dummy column. suffix_df["dummy"] = "" @@ -109,12 +121,14 @@ def _extract_tld(input_df, suffix_df, col_len, col_dict): tmp_dfs.append(unknown_suffix_df) else: continue + # Concat all temporary output dataframes - output_df = cudf.concat(tmp_dfs) + df_pkg = get_df_pkg_from_obj(input_df) + output_df = df_pkg.concat(tmp_dfs) return output_df -def _create_col_dict(allowed_output_cols, req_cols): +def _create_col_dict(allowed_output_cols: set[str], req_cols: set[str]) -> dict[str, bool]: """Creates dictionary to apply check condition while extracting tld. """ col_dict = {col: True for col in allowed_output_cols} @@ -124,7 +138,7 @@ def _create_col_dict(allowed_output_cols, req_cols): return col_dict -def _verify_req_cols(req_cols, allowed_output_cols): +def _verify_req_cols(req_cols: set[str], allowed_output_cols: set[str]) -> set[str]: """Verify user requested columns against allowed output columns. """ if req_cols is not None: @@ -135,7 +149,7 @@ def _verify_req_cols(req_cols, allowed_output_cols): return req_cols -def _generate_tld_cols(hostname_split_df, hostnames, col_len): +def _generate_tld_cols(hostname_split_df: DataFrameType, hostnames: SeriesType, col_len: int) -> DataFrameType: hostname_split_df = hostname_split_df.fillna("") hostname_split_df["tld" + str(col_len)] = hostname_split_df[col_len] # Add all other elements of hostname_split_df @@ -147,25 +161,25 @@ def _generate_tld_cols(hostname_split_df, hostnames, col_len): return hostname_split_df -def _extract_hostnames(urls): +def _extract_hostnames(urls: SeriesType) -> SeriesType: hostnames = urls.str.extract("([\\w]+[\\.].*[^/]|[\\-\\w]+[\\.].*[^/])")[0].str.extract("([\\w\\.\\-]+)")[0] return hostnames -def parse(urls, req_cols=None): +def parse(urls: SeriesType, req_cols: set[str] = None) -> DataFrameType: """ Extract hostname, domain, subdomain and suffix from URLs. Parameters ---------- - urls : cudf.Series + urls : SeriesType URLs to be parsed. req_cols : typing.Set[str] Selected columns to extract. Can be subset of (hostname, domain, subdomain and suffix). Returns ------- - cudf.DataFrame + DataFrameType Parsed dataframe with selected columns to extract. 
Examples @@ -196,6 +210,7 @@ def parse(urls, req_cols=None): 2 github com 3 pydata org """ + df_pkg = get_df_pkg_from_obj(urls) req_cols = _verify_req_cols(req_cols, _ALLOWED_OUTPUT_COLS) col_dict = _create_col_dict(req_cols, _ALLOWED_OUTPUT_COLS) hostnames = _extract_hostnames(urls) @@ -203,14 +218,21 @@ def parse(urls, req_cols=None): del urls hostname_split_ser = hostnames.str.findall("([^.]+)") hostname_split_df = hostname_split_ser.to_frame() - hostname_split_df = cudf.DataFrame(hostname_split_df[0].to_arrow().to_pylist()) + + if is_cudf_type(hostname_split_df): + hostname_split_df = df_pkg.DataFrame(hostname_split_df[0].to_arrow().to_pylist()) + else: + hostname_split_df = df_pkg.DataFrame(hostname_split_df[0].to_list()) + col_len = len(hostname_split_df.columns) - 1 hostname_split_df = _generate_tld_cols(hostname_split_df, hostnames, col_len) # remove hostnames since they are available in hostname_split_df del hostnames # Assign input index to idx column. hostname_split_df["idx"] = url_index - output_df = _extract_tld(hostname_split_df, _SUFFIX_DF, col_len, col_dict) + + suffix_df = _get_suffix_df(df_pkg) + output_df = _extract_tld(hostname_split_df, suffix_df, col_len, col_dict) # Sort index based on given input index order. output_df = output_df.sort_values("idx", ascending=True) # Drop temp columns. diff --git a/python/morpheus/morpheus/parsers/windows_event_parser.py b/python/morpheus/morpheus/parsers/windows_event_parser.py index 475c4a405d..8b62c2cf0e 100644 --- a/python/morpheus/morpheus/parsers/windows_event_parser.py +++ b/python/morpheus/morpheus/parsers/windows_event_parser.py @@ -16,10 +16,11 @@ import os import typing -import cudf - import morpheus from morpheus.parsers.event_parser import EventParser +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj log = logging.getLogger(__name__) @@ -41,17 +42,17 @@ def __init__(self, interested_eventcodes=None): self._event_regex = self._load_regex_yaml(regex_filepath) EventParser.__init__(self, self.get_columns(), self.EVENT_NAME) - def parse(self, text: cudf.Series) -> cudf.Series: + def parse(self, text: SeriesType) -> DataFrameType: """Parses the Windows raw event. Parameters ---------- - text : cudf.Series + text : SeriesType Raw event log text to be parsed Returns ------- - cudf.DataFrame + DataFrameType Parsed logs dataframe """ # Clean raw data to be consistent. @@ -65,23 +66,25 @@ def parse(self, text: cudf.Series) -> cudf.Series: temp = self.parse_raw_event(input_chunk, self._event_regex[eventcode]) if not temp.empty: output_chunks.append(temp) - parsed_dataframe = cudf.concat(output_chunks) + + df_pkg = get_df_pkg_from_obj(text) + parsed_dataframe = df_pkg.concat(output_chunks) # Replace null values with empty. parsed_dataframe = parsed_dataframe.fillna("") return parsed_dataframe - def clean_raw_data(self, text: cudf.Series) -> cudf.Series: + def clean_raw_data(self, text: SeriesType) -> SeriesType: """ Lower casing and replacing escape characters. 
 
         Parameters
         ----------
-        text : cudf.Series
+        text : SeriesType
             Raw event log text to be clean
 
         Returns
         -------
-        cudf.Series
+        SeriesType
             Clean raw event log text
         """
         text = (text.str.lower().str.replace("\\\\t", "").str.replace("\\\\r", "").str.replace("\\\\n", "|"))
diff --git a/python/morpheus/morpheus/parsers/zeek.py b/python/morpheus/morpheus/parsers/zeek.py
index 44ef464e1f..bc8d5683b7 100644
--- a/python/morpheus/morpheus/parsers/zeek.py
+++ b/python/morpheus/morpheus/parsers/zeek.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import cudf
+from morpheus.io.utils import get_csv_reader
+from morpheus.utils.type_aliases import DataFrameModule
+from morpheus.utils.type_aliases import DataFrameType
 
 TYPE_DICT = {
     "bool": "bool",
@@ -36,7 +38,7 @@
 }
 
 
-def parse(filepath: str) -> cudf.DataFrame:
+def parse(filepath: str, df_type: DataFrameModule = "cudf") -> DataFrameType:
     """
     Parse Zeek log file and return cuDF dataframe.
     Uses header comments to get column names/types and configure parser.
 
@@ -45,20 +47,23 @@
     ----------
     filepath : str
         File path of Zeek log file
+    df_type : DataFrameModule, default 'cudf'
+        Type of dataframe to return. Either 'cudf' or 'pandas'
 
     Returns
     -------
-    cudf.DataFrame
+    DataFrameType
         Parsed Zeek log dataframe
     """
-    header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8)
+    csv_reader = get_csv_reader(df_type)
+    header_gdf = csv_reader(filepath, names=["line"], nrows=8)
     lines_gdf = header_gdf["line"].str.split()
     column_names = lines_gdf.iloc[6][1:]
    column_types = lines_gdf.iloc[7][1:]
 
     column_dtypes = list(map(lambda x: TYPE_DICT.get(x, "str"), column_types))
 
-    log_gdf = cudf.read_csv(
+    log_gdf = csv_reader(
         filepath,
         delimiter="\t",
         dtype=column_dtypes,
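A short usage sketch for the new `df_type` parameter (the log path is a placeholder):

```python
# Placeholder log path: select the DataFrame backend at call time.
from morpheus.parsers import zeek

conn_gdf = zeek.parse("conn.log")                   # cuDF (default, needs GPU)
conn_df = zeek.parse("conn.log", df_type="pandas")  # pandas, safe in CPU-only mode
```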
+ """ + return (ExecutionMode.CPU, ) + + +class GpuAndCpuMixin(ABC): + """ + Mixin intented to be added to stages which support both GPU and CPU execution modes. + """ + + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + + @property + def df_type_str(self) -> DataFrameModule: + """ + Returns the DataFrame module that should be used for the given execution mode. + """ + return type_utils.exec_mode_to_df_type_str(self._config.execution_mode) + + def get_df_pkg(self) -> types.ModuleType: + """ + Returns the DataFrame package that should be used for the given execution mode. + """ + return type_utils.get_df_pkg(self._config.execution_mode) + + def get_df_class(self) -> type[DataFrameType]: + """ + Returns the DataFrame class that should be used for the given execution mode. + """ + return type_utils.get_df_class(self._config.execution_mode) diff --git a/python/morpheus/morpheus/pipeline/linear_pipeline.py b/python/morpheus/morpheus/pipeline/linear_pipeline.py index 7b8a4a767c..add998fa4c 100644 --- a/python/morpheus/morpheus/pipeline/linear_pipeline.py +++ b/python/morpheus/morpheus/pipeline/linear_pipeline.py @@ -148,6 +148,7 @@ def add_segment_boundary(self, data_type=None, as_shared_pointer=False): raise RuntimeError("Cannot create a segment boundary, current segment is empty.") empty_config = Config() + empty_config.execution_mode = self._execution_mode boundary_egress = LinearBoundaryEgressStage(empty_config, boundary_port_id=self._current_segment_id, data_type=data_type) diff --git a/python/morpheus/morpheus/pipeline/pipeline.py b/python/morpheus/morpheus/pipeline/pipeline.py index 6f719e4d54..1298f6f020 100644 --- a/python/morpheus/morpheus/pipeline/pipeline.py +++ b/python/morpheus/morpheus/pipeline/pipeline.py @@ -60,6 +60,7 @@ class Pipeline(): """ def __init__(self, config: Config): + config.freeze() self._mutex = threading.RLock() @@ -91,6 +92,8 @@ def __init__(self, config: Config): # Future that allows post_start to propagate exceptions back to pipeline self._post_start_future: asyncio.Future = None + self._execution_mode = config.execution_mode + @property def state(self) -> PipelineState: return self._state diff --git a/python/morpheus/morpheus/pipeline/preallocator_mixin.py b/python/morpheus/morpheus/pipeline/preallocator_mixin.py index e50b4e2070..a06f7e1532 100644 --- a/python/morpheus/morpheus/pipeline/preallocator_mixin.py +++ b/python/morpheus/morpheus/pipeline/preallocator_mixin.py @@ -17,18 +17,16 @@ from abc import ABC from collections import OrderedDict -import cupy as cp import mrc import numpy as np import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.common import TypeId from morpheus.common import typeid_is_fully_supported from morpheus.common import typeid_to_numpy_str from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.type_aliases import DataFrameType @@ -40,7 +38,7 @@ class PreallocatorMixin(ABC): """ Mixin intented to be added to stages, typically source stages, which are emitting newly constructed DataFrame or - MessageMeta instances into the segment. During segment build, if the `_needed_columns` addtribut is not empty an + MessageMeta instances into the segment. 
diff --git a/python/morpheus/morpheus/pipeline/linear_pipeline.py b/python/morpheus/morpheus/pipeline/linear_pipeline.py
index 7b8a4a767c..add998fa4c 100644
--- a/python/morpheus/morpheus/pipeline/linear_pipeline.py
+++ b/python/morpheus/morpheus/pipeline/linear_pipeline.py
@@ -148,6 +148,7 @@ def add_segment_boundary(self, data_type=None, as_shared_pointer=False):
             raise RuntimeError("Cannot create a segment boundary, current segment is empty.")
 
         empty_config = Config()
+        empty_config.execution_mode = self._execution_mode
         boundary_egress = LinearBoundaryEgressStage(empty_config,
                                                     boundary_port_id=self._current_segment_id,
                                                     data_type=data_type)
diff --git a/python/morpheus/morpheus/pipeline/pipeline.py b/python/morpheus/morpheus/pipeline/pipeline.py
index 6f719e4d54..1298f6f020 100644
--- a/python/morpheus/morpheus/pipeline/pipeline.py
+++ b/python/morpheus/morpheus/pipeline/pipeline.py
@@ -60,6 +60,7 @@ class Pipeline():
     """
 
     def __init__(self, config: Config):
+        config.freeze()
 
         self._mutex = threading.RLock()
 
@@ -91,6 +92,8 @@ def __init__(self, config: Config):
         # Future that allows post_start to propagate exceptions back to pipeline
         self._post_start_future: asyncio.Future = None
 
+        self._execution_mode = config.execution_mode
+
     @property
     def state(self) -> PipelineState:
         return self._state
diff --git a/python/morpheus/morpheus/pipeline/preallocator_mixin.py b/python/morpheus/morpheus/pipeline/preallocator_mixin.py
index e50b4e2070..a06f7e1532 100644
--- a/python/morpheus/morpheus/pipeline/preallocator_mixin.py
+++ b/python/morpheus/morpheus/pipeline/preallocator_mixin.py
@@ -17,18 +17,16 @@
 from abc import ABC
 from collections import OrderedDict
 
-import cupy as cp
 import mrc
 import numpy as np
 import pandas as pd
 from mrc.core import operators as ops
 
-import cudf
-
 from morpheus.common import TypeId
 from morpheus.common import typeid_is_fully_supported
 from morpheus.common import typeid_to_numpy_str
 from morpheus.config import CppConfig
+from morpheus.config import ExecutionMode
 from morpheus.messages import ControlMessage
 from morpheus.messages import MessageMeta
 from morpheus.utils.type_aliases import DataFrameType
@@ -40,7 +38,7 @@ class PreallocatorMixin(ABC):
     """
     Mixin intented to be added to stages, typically source stages, which are emitting newly constructed DataFrame or
-    MessageMeta instances into the segment. During segment build, if the `_needed_columns` addtribut is not empty an
+    MessageMeta instances into the segment. During segment build, if the `_needed_columns` attribute is not empty an
     additional node will be inserted into the graph after the derived class' node which will perform the allocation.
 
     The exceptions would be non-source stages like DFP's `DFPFileToDataFrameStage` which are not sources but are
@@ -59,7 +57,9 @@ def set_needed_columns(self, needed_columns: OrderedDict):
     def _preallocate_df(self, df: DataFrameType) -> DataFrameType:
         missing_columns = [col for col in self._needed_columns.keys() if col not in df.columns]
         if len(missing_columns) > 0:
-            if isinstance(df, cudf.DataFrame):
+            if not isinstance(df, pd.DataFrame):
+                # assume cudf.DataFrame
+                import cupy as cp
                 alloc_func = cp.zeros
             else:
                 alloc_func = np.zeros
@@ -118,12 +118,19 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject)
                 node = builder.make_node(node_name, ops.map(self._preallocate_meta))
             else:
                 raise RuntimeError(f"Unsupported output type {pretty_type}")
-        elif issubclass(out_type, (cudf.DataFrame, pd.DataFrame)):
-            node = builder.make_node(node_name, ops.map(self._preallocate_df))
+
         else:
-            msg = ("Additional columns were requested to be inserted into the Dataframe, but the output type "
-                   f"{pretty_type} isn't a supported type")
-            raise RuntimeError(msg)
+            supported_df_types = [pd.DataFrame]
+            if self._config.execution_mode == ExecutionMode.GPU:
+                import cudf
+                supported_df_types.append(cudf.DataFrame)
+
+            if issubclass(out_type, tuple(supported_df_types)):
+                node = builder.make_node(node_name, ops.map(self._preallocate_df))
+            else:
+                msg = ("Additional columns were requested to be inserted into the Dataframe, but the output type "
+                       f"{pretty_type} isn't a supported type")
+                raise RuntimeError(msg)
 
         builder.make_edge(out_node, node)
         out_node = node
diff --git a/python/morpheus/morpheus/pipeline/single_port_stage.py b/python/morpheus/morpheus/pipeline/single_port_stage.py
index b9ea20aeeb..49687afc96 100644
--- a/python/morpheus/morpheus/pipeline/single_port_stage.py
+++ b/python/morpheus/morpheus/pipeline/single_port_stage.py
@@ -83,7 +83,6 @@ def _build(self, builder: mrc.Builder, input_nodes: list[mrc.SegmentObject]) ->
     def _post_build_single(self, _: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject:
         return out_node
 
-    @typing.final
     def _post_build(self, builder: mrc.Builder, out_ports_nodes: list[mrc.SegmentObject]) -> list[mrc.SegmentObject]:
         ret_val = self._post_build_single(builder, out_ports_nodes[0])
diff --git a/python/morpheus/morpheus/pipeline/stage_base.py b/python/morpheus/morpheus/pipeline/stage_base.py
index ebae6cbfef..ebb5541166 100644
--- a/python/morpheus/morpheus/pipeline/stage_base.py
+++ b/python/morpheus/morpheus/pipeline/stage_base.py
@@ -27,6 +27,7 @@
 import morpheus.pipeline as _pipeline  # pylint: disable=cyclic-import
 from morpheus.config import Config
 from morpheus.config import CppConfig
+from morpheus.config import ExecutionMode
 from morpheus.utils.atomic_integer import AtomicInteger
 from morpheus.utils.type_utils import _DecoratorType
 
@@ -84,6 +85,7 @@ class StageBase(ABC, collections.abc.Hashable):
 
     def __init__(self, config: Config):
         # Save the config
+        config.freeze()
         self._config = config
 
         self._id = StageBase.__ID_COUNTER.get_and_inc()
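Both `Pipeline.__init__` and `StageBase.__init__` now freeze the config at construction time; a sketch of the intended contract (assuming a frozen `Config` raises on mutation):

```python
# Sketch of the freeze contract (assumes mutation of a frozen Config raises).
from morpheus.config import Config, ExecutionMode

config = Config()
config.execution_mode = ExecutionMode.CPU  # OK: not yet frozen

config.freeze()                            # happens on first pipeline/stage use
try:
    config.num_threads = 1                 # mutation after freeze
except Exception as exc:
    print(f"config is immutable once frozen: {exc!r}")
```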
@@ -285,6 +287,19 @@ def supports_cpp_node(self) -> bool:
         #     return False
         pass
 
+    def supported_execution_modes(self) -> tuple[ExecutionMode]:
+        """
+        Returns a tuple of supported execution modes of this stage. By default this returns `(ExecutionMode.GPU,)`.
+        Subclasses can override this method to specify different execution modes.
+
+        For most stages the values will be static, and this can be accomplished by making use of either the
+        `CpuOnlyMixin` or `GpuAndCpuMixin` mixins.
+
+        However, complex stages may choose to make this decision at runtime, in which case this method should be
+        overridden directly within the stage class.
+        """
+        return (ExecutionMode.GPU, )
+
     def _build_cpp_node(self):
         """
         Specifies whether to build a C++ node. Only should be called during the build phase.
@@ -347,6 +362,14 @@ def can_build(self, check_ports=False) -> bool:
     def _pre_build(self, do_propagate: bool = True):
         assert not self.is_built, "build called prior to _pre_build"
         assert not self.is_pre_built, "Can only pre-build stages once!"
+
+        # Check the execution mode
+        if (self._config.execution_mode not in self.supported_execution_modes()):
+            supported_modes = ", ".join(str(x) for x in self.supported_execution_modes())
+            raise RuntimeError(f"Unsupported execution mode {self._config.execution_mode} for stage {self.name}, "
+                               f"supported execution modes are {supported_modes}")
+
+        # Perform schema validation
         schema = _pipeline.StageSchema(self)
         self._pre_compute_schema(schema)
         self.compute_schema(schema)
diff --git a/python/morpheus/morpheus/pipeline/stage_decorator.py b/python/morpheus/morpheus/pipeline/stage_decorator.py
index bede41b3e2..fb2412a325 100644
--- a/python/morpheus/morpheus/pipeline/stage_decorator.py
+++ b/python/morpheus/morpheus/pipeline/stage_decorator.py
@@ -22,11 +22,10 @@
 import pandas as pd
 from mrc.core import operators as ops
 
-import cudf
-
 import morpheus.pipeline as _pipeline  # pylint: disable=cyclic-import
 from morpheus.common import TypeId
 from morpheus.config import Config
+from morpheus.config import ExecutionMode
 from morpheus.messages import MessageMeta
 
 logger = logging.getLogger(__name__)
@@ -123,7 +122,13 @@ class WrappedFunctionSourceStage(_pipeline.SingleOutputSource):
         Function to use for computing the schema of the stage.
     """
 
-    def __init__(self, config: Config, *, name: str, gen_fn: GeneratorType, compute_schema_fn: ComputeSchemaType):
+    def __init__(self,
+                 config: Config,
+                 *,
+                 name: str,
+                 gen_fn: GeneratorType,
+                 compute_schema_fn: ComputeSchemaType,
+                 execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, )):
         super().__init__(config)
         # collections.abc.Generator is a subclass of collections.abc.Iterator
         if not inspect.isgeneratorfunction(gen_fn):
@@ -132,6 +137,7 @@ def __init__(self, config: Config, *, name: str, gen_fn: GeneratorType, compute_
         self._name = name
         self._gen_fn = gen_fn
         self._compute_schema_fn = compute_schema_fn
+        self._supported_execution_modes = execution_modes
 
     @property
     def name(self) -> str:
@@ -143,6 +149,12 @@ def supports_cpp_node(self) -> bool:
     def compute_schema(self, schema: _pipeline.StageSchema):
         self._compute_schema_fn(schema)
 
+    def supported_execution_modes(self) -> tuple[ExecutionMode]:
+        """
+        Returns a tuple of supported execution modes of this stage.
+ """ + return self._supported_execution_modes + def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: return builder.make_source(self.unique_name, self._gen_fn) @@ -172,7 +184,8 @@ def source( gen_fn: GeneratorType = None, *, name: str = None, - compute_schema_fn: ComputeSchemaType = None + compute_schema_fn: ComputeSchemaType = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, ) ) -> typing.Callable[typing.Concatenate[Config, _P], WrappedFunctionSourceStage]: """ Decorator for wrapping a function as a source stage. The function must be a generator method, and provide a @@ -196,7 +209,10 @@ def source( >>> pipe.set_source(source_gen(config, dataframes=[df])) """ if gen_fn is None: - return functools.partial(source, name=name, compute_schema_fn=compute_schema_fn) + return functools.partial(source, + name=name, + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) # Use wraps to ensure user's don't lose their function name and docstrinsgs, however we do want to override the # annotations to reflect that the returned function requires a config and returns a stage @@ -236,18 +252,25 @@ def compute_schema_fn_inner(schema: _pipeline.StageSchema): bound_gen_fn = functools.partial(gen_fn, **kwargs) + pre_allocation_output_types = [pd.DataFrame, MessageMeta] + if config.execution_mode == ExecutionMode.GPU: + import cudf + pre_allocation_output_types.append(cudf.DataFrame) + # If the return type supports pre-allocation we use the pre-allocating source - if return_type in (pd.DataFrame, cudf.DataFrame, MessageMeta): + if return_type in pre_allocation_output_types: return PreAllocatedWrappedFunctionStage(config=config, name=name, gen_fn=bound_gen_fn, - compute_schema_fn=compute_schema_fn) + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) return WrappedFunctionSourceStage(config=config, name=name, gen_fn=bound_gen_fn, - compute_schema_fn=compute_schema_fn) + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) return wrapper @@ -276,16 +299,15 @@ class WrappedFunctionStage(_pipeline.SinglePortStage): by the `PreAllocatedWrappedFunctionStage` to ensure the DataFrame has the needed columns allocated. """ - def __init__( - self, - config: Config, - *, - name: str = None, - on_data_fn: typing.Callable, - accept_type: type, - compute_schema_fn: ComputeSchemaType, - needed_columns: dict[str, TypeId] = None, - ): + def __init__(self, + config: Config, + *, + name: str = None, + on_data_fn: typing.Callable, + accept_type: type, + compute_schema_fn: ComputeSchemaType, + needed_columns: dict[str, TypeId] = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, )): super().__init__(config) self._name = name self._on_data_fn = on_data_fn @@ -295,6 +317,8 @@ def __init__( if needed_columns is not None: self._needed_columns.update(needed_columns) + self._supported_execution_modes = execution_modes + @property def name(self) -> str: return self._name @@ -308,6 +332,12 @@ def supports_cpp_node(self) -> bool: def compute_schema(self, schema: _pipeline.StageSchema): self._compute_schema_fn(schema) + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. 
+ """ + return self._supported_execution_modes + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = builder.make_node(self.unique_name, ops.map(self._on_data_fn)) builder.make_edge(input_node, node) @@ -318,12 +348,15 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> DecoratedStageType = typing.Callable[typing.Concatenate[Config, _P], WrappedFunctionStage] -def stage(on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] = None, - *, - name: str = None, - accept_type: type = None, - compute_schema_fn: ComputeSchemaType = None, - needed_columns: dict[str, TypeId] = None) -> DecoratedStageType: +def stage( + on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] = None, + *, + name: str = None, + accept_type: type = None, + compute_schema_fn: ComputeSchemaType = None, + needed_columns: dict[str, TypeId] = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, ) +) -> DecoratedStageType: """ Decorator for wrapping a function as a stage. The function must receive at least one argument, the first argument must be the incoming message, and must return a value. @@ -359,7 +392,8 @@ def stage(on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] name=name, accept_type=accept_type, compute_schema_fn=compute_schema_fn, - needed_columns=needed_columns) + needed_columns=needed_columns, + execution_modes=execution_modes) # Use wraps to ensure user's don't lose their function name and docstrinsgs, however we do want to override the # annotations to reflect that the returned function requires a config and returns a stage @@ -410,6 +444,7 @@ def compute_schema_fn_inner(schema: _pipeline.StageSchema): on_data_fn=bound_on_data_fn, accept_type=accept_type, compute_schema_fn=compute_schema_fn, - needed_columns=needed_columns) + needed_columns=needed_columns, + execution_modes=execution_modes) return wrapper diff --git a/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py b/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py index ad8db9ebc2..c1e42169c2 100644 --- a/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py +++ b/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py @@ -20,6 +20,7 @@ from morpheus.config import Config from morpheus.pipeline.boundary_stage_mixin import BoundaryStageMixin +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource @@ -29,7 +30,7 @@ logger = logging.getLogger(__name__) -class LinearBoundaryEgressStage(BoundaryStageMixin, PassThruTypeMixin, SinglePortStage): +class LinearBoundaryEgressStage(BoundaryStageMixin, PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ The LinearBoundaryEgressStage acts as an egress point from one linear segment to another. 
Given an existing linear pipeline that we want to connect to another segment, a linear boundary egress stage would be added, in conjunction @@ -82,7 +83,7 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> return input_node -class LinearBoundaryIngressStage(BoundaryStageMixin, PreallocatorMixin, SingleOutputSource): +class LinearBoundaryIngressStage(BoundaryStageMixin, PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ The LinearBoundaryIngressStage acts as source ingress point from a corresponding egress in another linear segment. Given an existing linear pipeline that we want to connect to another segment, a linear boundary egress stage would diff --git a/python/morpheus/morpheus/stages/general/monitor_stage.py b/python/morpheus/morpheus/stages/general/monitor_stage.py index cc3a96f33f..821fe729bd 100644 --- a/python/morpheus/morpheus/stages/general/monitor_stage.py +++ b/python/morpheus/morpheus/stages/general/monitor_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.monitor_controller import MonitorController +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.logger import LogLevels @@ -30,7 +31,7 @@ @register_stage("monitor", ignore_args=["determine_count_fn"]) -class MonitorStage(PassThruTypeMixin, SinglePortStage): +class MonitorStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Display throughput numbers at a specific point in the pipeline. diff --git a/python/morpheus/morpheus/stages/general/multi_processing_stage.py b/python/morpheus/morpheus/stages/general/multi_processing_stage.py index 8011ae7591..fbe60f410a 100644 --- a/python/morpheus/morpheus/stages/general/multi_processing_stage.py +++ b/python/morpheus/morpheus/stages/general/multi_processing_stage.py @@ -22,6 +22,7 @@ import mrc.core.operators as ops from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.shared_process_pool import SharedProcessPool @@ -229,6 +230,12 @@ def name(self) -> str: """Return the name of the stage.""" return self._name + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. 
+ """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + def _on_data(self, data: InputT) -> OutputT: task = self._shared_process_pool.submit_task(self.name, self._process_fn, data) result = task.result() diff --git a/python/morpheus/morpheus/stages/general/trigger_stage.py b/python/morpheus/morpheus/stages/general/trigger_stage.py index b8b754d910..3164a84b64 100644 --- a/python/morpheus/morpheus/stages/general/trigger_stage.py +++ b/python/morpheus/morpheus/stages/general/trigger_stage.py @@ -19,6 +19,7 @@ from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -26,7 +27,7 @@ @register_stage("trigger") -class TriggerStage(PassThruTypeMixin, SinglePortStage): +class TriggerStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Buffer data until the previous stage has completed. diff --git a/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py b/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py index 32f3c569ad..f731d77a36 100644 --- a/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py +++ b/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes @@ -65,7 +64,7 @@ def build_output_message(self, msg: ControlMessage) -> ControlMessage: output_message = ControlMessage(msg) output_message.payload(msg.payload()) - output_message.tensors(_messages.TensorMemory(count=output_dims[0], tensors={"probs": cp.zeros(output_dims)})) + output_message.tensors(TensorMemory(count=output_dims[0], tensors={"probs": cp.zeros(output_dims)})) return output_message @@ -87,7 +86,7 @@ def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory """ - data = batch.payload().get_data(batch.payload().df.columns.intersection(self._feature_columns)) + data = batch.payload().get_data(batch.payload().df.columns.intersection(self._feature_columns)).to_pandas() explain_cols = [x + "_z_loss" for x in self._feature_columns] + ["max_abs_z", "mean_abs_z"] explain_df = pd.DataFrame(np.empty((batch.tensors().count, (len(self._feature_columns) + 2)), dtype=object), diff --git a/python/morpheus/morpheus/stages/inference/inference_stage.py b/python/morpheus/morpheus/stages/inference/inference_stage.py index d5c58937aa..4219064432 100644 --- a/python/morpheus/morpheus/stages/inference/inference_stage.py +++ b/python/morpheus/morpheus/stages/inference/inference_stage.py @@ -14,14 +14,12 @@ import logging import typing -from abc import abstractmethod from functools import partial import cupy as cp import mrc from mrc.core import operators as ops -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages.memory.tensor_memory import TensorMemory @@ -82,15 +80,14 @@ def build_output_message(self, msg: ControlMessage) -> ControlMessage: dims = self.calc_output_dims(msg) output_dims = (msg.payload().count, *dims[1:]) - memory = _messages.TensorMemory(count=output_dims[0], tensors={'probs': cp.zeros(output_dims)}) + memory = TensorMemory(count=output_dims[0], tensors={'probs': 
cp.zeros(output_dims)}) output_message = ControlMessage(msg) output_message.payload(msg.payload()) output_message.tensors(memory) return output_message - @abstractmethod - def calc_output_dims(self, msg: ControlMessage) -> typing.Tuple: + def calc_output_dims(self, msg: ControlMessage) -> tuple: """ Calculates the dimensions of the inference output message data given an input message. @@ -101,12 +98,11 @@ def calc_output_dims(self, msg: ControlMessage) -> typing.Tuple: Returns ------- - typing.Tuple + tuple Output dimensions of response. """ - pass + raise NotImplementedError("No Python implementation provided by this stage") - @abstractmethod def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory], None]): """ Main inference processing function. This function will be called once for each mini-batch. Once the inference is @@ -121,7 +117,7 @@ def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory Callback to set the values for the inference response. """ - pass + raise NotImplementedError("No Python implementation provided by this stage") class InferenceStage(ControlMessageStage): @@ -152,15 +148,21 @@ class InferenceStage(ControlMessageStage): ---------- c : `morpheus.config.Config` Pipeline configuration instance. - + thread_count : int, optional + Number of threads to use for inference. If not provided, the `num_threads` attribute of the `Config` object + will be used. """ - def __init__(self, c: Config): + def __init__(self, c: Config, thread_count: int = None): super().__init__(c) + # GPU only stage, assuming all messages are cuDF/CuPy based + import cudf + self._cudf = cudf + self._fea_length = c.feature_length - self._thread_count = c.num_threads + self._thread_count = thread_count or c.num_threads self._workers: typing.List[InferenceWorker] = [] self._inf_queue = ProducerConsumerQueue() @@ -173,13 +175,13 @@ def __init__(self, c: Config): def name(self) -> str: return "inference" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Accepted input types to this stage. Returns ------- - typing.Tuple + tuple Tuple of input types. """ return (ControlMessage, ) @@ -187,11 +189,10 @@ def accepted_types(self) -> typing.Tuple: def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(ControlMessage) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: # Default to False unless derived classes override this value return False - @abstractmethod def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWorker: """ Returns the main inference worker which manages requests possibly in another thread depending on which mode the @@ -209,7 +210,7 @@ def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWo `InferenceWorker` Inference worker implementation for stage. 
""" - pass + raise NotImplementedError("No Python implementation provided by this stage") def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: raise NotImplementedError("No C++ node is available for this inference type") @@ -327,7 +328,7 @@ def _split_batches(msg: ControlMessage, max_batch_size: int) -> typing.List[Cont out_msg.payload(msg.payload().get_slice(start, stop)) - out_msg_tensors = _messages.TensorMemory(count=stop - start, tensors={}) + out_msg_tensors = TensorMemory(count=stop - start, tensors={}) for (name, tensor) in msg.tensors().get_tensors().items(): out_msg_tensors.set_tensor(name, tensor[start:stop]) out_msg.tensors(out_msg_tensors) diff --git a/python/morpheus/morpheus/stages/inference/triton_inference_stage.py b/python/morpheus/morpheus/stages/inference/triton_inference_stage.py index a90fe6a983..62f0a51d8e 100644 --- a/python/morpheus/morpheus/stages/inference/triton_inference_stage.py +++ b/python/morpheus/morpheus/stages/inference/triton_inference_stage.py @@ -28,7 +28,6 @@ from tritonclient.utils import InferenceServerException from tritonclient.utils import triton_to_np_dtype -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes @@ -685,6 +684,9 @@ class TritonInferenceStage(InferenceStage): which will be inroduced as: inout_mapping={"mask": "input_mask", "output": "probs"} + thread_count : int, optional + Number of threads to use for inference. If not provided, the `num_threads` attribute of the `Config` object + will be used. """ _INFERENCE_WORKER_DEFAULT_INOUT_MAPPING = { @@ -711,8 +713,9 @@ def __init__(self, needs_logits: bool = None, inout_mapping: dict[str, str] = None, input_mapping: dict[str, str] = None, - output_mapping: dict[str, str] = None): - super().__init__(c) + output_mapping: dict[str, str] = None, + thread_count: int = None): + super().__init__(c, thread_count=thread_count) self._config = c @@ -781,6 +784,7 @@ def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> TritonInfer needs_logits=self._needs_logits) def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: + import morpheus._lib.stages as _stages return _stages.InferenceClientStage(builder, self.unique_name, self._server_url, diff --git a/python/morpheus/morpheus/stages/input/appshield_source_stage.py b/python/morpheus/morpheus/stages/input/appshield_source_stage.py index acd22a54fa..e1e76b4023 100644 --- a/python/morpheus/morpheus/stages/input/appshield_source_stage.py +++ b/python/morpheus/morpheus/stages/input/appshield_source_stage.py @@ -16,7 +16,6 @@ import json import logging import re -import typing from functools import partial from json.decoder import JSONDecodeError @@ -27,8 +26,10 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline import SingleOutputSource +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.directory_watcher import DirectoryWatcher @@ -37,7 +38,7 @@ @register_stage("from-appshield", modes=[PipelineModes.FIL]) -class AppShieldSourceStage(PreallocatorMixin, 
SingleOutputSource): +class AppShieldSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load AppShield messages from one or more plugins into a dataframe. It normalizes nested JSON messages and arranges them into a dataframe by snapshot @@ -77,9 +78,9 @@ class AppShieldSourceStage(PreallocatorMixin, SingleOutputSource): def __init__(self, c: Config, input_glob: str, - plugins_include: typing.List[str], - cols_include: typing.List[str], - cols_exclude: typing.List[str] = None, + plugins_include: list[str], + cols_include: list[str], + cols_exclude: list[str] = None, watch_directory: bool = False, max_files: int = -1, sort_glob: bool = False, @@ -102,6 +103,9 @@ def __init__(self, self._input_count = None + import cudf + self._cudf = cudf + self._watcher = DirectoryWatcher(input_glob=input_glob, watch_directory=watch_directory, max_files=max_files, @@ -124,10 +128,10 @@ def supports_cpp_node(self): return False def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(AppShieldMessageMeta) + schema.output_schema.set_type(ControlMessage) @staticmethod - def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str]): + def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: list[str]): """ Fill missing interested plugin columns. @@ -135,7 +139,7 @@ def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str] ---------- plugin_df : pandas.DataFrame Snapshot plugin dataframe - cols_include : typing.List[str] + cols_include : list[str] Columns that need to be included. Returns @@ -152,7 +156,7 @@ def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str] return plugin_df @staticmethod - def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): + def read_file_to_df(file: io.TextIOWrapper, cols_exclude: list[str]): """ Read file content to dataframe. @@ -160,7 +164,7 @@ def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): ---------- file : `io.TextIOWrapper` Input file object - cols_exclude : typing.List[str] + cols_exclude : list[str] Columns to drop from the dataframe. Returns @@ -185,7 +189,7 @@ def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): return plugin_df @staticmethod - def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd.DataFrame: + def load_df(filepath: str, cols_exclude: list[str], encoding: str) -> pd.DataFrame: """ Reads a file into a dataframe. @@ -193,7 +197,7 @@ def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd. ---------- filepath : str Path to a file. - cols_exclude : typing.List[str] + cols_exclude : list[str] Columns to exclude. encoding : str Encoding to read a file. @@ -228,13 +232,13 @@ def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd. return plugin_df @staticmethod - def load_meta_cols(filepath_split: typing.List[str], plugin: str, plugin_df: pd.DataFrame) -> pd.DataFrame: + def load_meta_cols(filepath_split: list[str], plugin: str, plugin_df: pd.DataFrame) -> pd.DataFrame: """ Loads meta columns to dataframe. Parameters ---------- - filepath_split : typing.List[str] + filepath_split : list[str] Splits of file path. plugin : str Plugin name to which the data belongs. @@ -268,20 +272,20 @@ 
return plugin_df @staticmethod - def batch_source_split(x: typing.List[pd.DataFrame], source: str) -> typing.Dict[str, pd.DataFrame]: + def batch_source_split(x: list[pd.DataFrame], source: str) -> dict[str, pd.DataFrame]: """ Combines plugin dataframes from multiple snapshots and splits the combined dataframe per source. Parameters ---------- - x : typing.List[pd.DataFrame] + x : list[pd.DataFrame] Dataframes from multiple sources. source : str Source column name to group by. Returns ------- - typing.Dict[str, pandas.DataFrame] + dict[str, pandas.DataFrame] Grouped dataframes by source. """ @@ -301,30 +305,30 @@ def batch_source_split(x: typing.List[pd.DataFrame], source: str) -> typing.Dict return source_dfs @staticmethod - def files_to_dfs(x: typing.List[str], - cols_include: typing.List[str], - cols_exclude: typing.List[str], - plugins_include: typing.List[str], - encoding: str) -> typing.Dict[str, pd.DataFrame]: + def files_to_dfs(x: list[str], + cols_include: list[str], + cols_exclude: list[str], + plugins_include: list[str], + encoding: str) -> dict[str, pd.DataFrame]: """ Load plugin files into a dataframe, then segment the dataframe by source. Parameters ---------- - x : typing.List[str] + x : list[str] List of file paths. - cols_include : typing.List[str] + cols_include : list[str] Columns to include. - cols_exclude : typing.List[str] + cols_exclude : list[str] Columns to exclude. - plugins_include: typing.List[str] + plugins_include: list[str] For each path in `x`, a list of plugins to load additional meta cols from. encoding : str Encoding to read a file. Returns ------- - typing.Dict[str, pandas.DataFrame] + dict[str, pandas.DataFrame] Grouped dataframes by source. """ # Using pandas to parse nested JSON until cuDF adds support @@ -348,18 +352,19 @@ def files_to_dfs(x: typing.List[str], return df_per_source - @staticmethod - def _build_metadata(x: typing.Dict[str, pd.DataFrame]): + def _build_messages(self, source_dfs: dict[str, pd.DataFrame]): - metas = [] + output_messages = [] - for source, df in x.items(): - # Now make a AppShieldMessageMeta with the source name - meta = AppShieldMessageMeta(df, source) - metas.append(meta) + for source, df in source_dfs.items(): + # Now make a message with the source name + cm = ControlMessage() + cm.payload(MessageMeta(self._cudf.DataFrame(df))) + cm.set_metadata("source", source) + output_messages.append(cm) - return metas + return output_messages def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: # The first source just produces filenames @@ -376,8 +381,8 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) cols_exclude=self._cols_exclude, plugins_include=self._plugins_include, encoding=self._encoding)), - ops.map(self._build_metadata), - # Finally flatten to single meta + ops.map(self._build_messages), + # Emit each message individually ops.flatten()) builder.make_edge(out_node, post_node) diff --git a/python/morpheus/morpheus/stages/input/arxiv_source.py b/python/morpheus/morpheus/stages/input/arxiv_source.py index b995d3c6b8..34fb3582b3 100644 --- a/python/morpheus/morpheus/stages/input/arxiv_source.py +++ b/python/morpheus/morpheus/stages/input/arxiv_source.py @@ -20,11 +20,11 @@ import mrc.core.operators as ops import pandas as pd -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import 
GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -41,7 +41,7 @@ @register_stage("from-arxiv") -class ArxivSource(PreallocatorMixin, SingleOutputSource): +class ArxivSource(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that downloads PDFs from arxiv and converts them to dataframes. @@ -98,6 +98,10 @@ def __init__(self, self._total_chunks = 0 self._cache_dir = cache_dir + if c.execution_mode == ExecutionMode.GPU: + import cudf + self._cudf = cudf + @property def name(self) -> str: """Return the name of the stage""" @@ -195,4 +199,7 @@ def _splitting_pages(self, documents: list["Document"]): df.rename(columns=map_cols, inplace=True) - return MessageMeta(cudf.from_pandas(df)) + if self._config.execution_mode == ExecutionMode.GPU: + df = self._cudf.from_pandas(df) + + return MessageMeta(df) diff --git a/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py b/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py index 6675b3eacd..7174650893 100644 --- a/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py +++ b/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import typing from abc import abstractmethod from functools import partial @@ -23,14 +22,16 @@ from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.messages import UserMessageMeta +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.directory_watcher import DirectoryWatcher -class AutoencoderSourceStage(PreallocatorMixin, SingleOutputSource): +class AutoencoderSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ All AutoEncoder source stages must extend this class and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input @@ -91,12 +92,14 @@ def __init__(self, self._input_count = None # Hold the max index we have seen to ensure sequential and increasing indexes - self._rows_per_user: typing.Dict[str, int] = {} + self._rows_per_user: dict[str, int] = {} # Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. Iterative mode # is good for interleaving source stages. 
self._repeat_count = repeat + self._df_class = self.get_df_class() + self._watcher = DirectoryWatcher(input_glob=input_glob, watch_directory=watch_directory, max_files=max_files, @@ -112,7 +115,7 @@ def input_count(self) -> int: return self._input_count if self._input_count is not None else 0 def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(UserMessageMeta) + schema.output_schema.set_type(ControlMessage) def get_match_pattern(self, glob_split): """Return a file match pattern""" @@ -122,7 +125,7 @@ def get_match_pattern(self, glob_split): return match_pattern @staticmethod - def repeat_df(df: pd.DataFrame, repeat_count: int) -> typing.List[pd.DataFrame]: + def repeat_df(df: pd.DataFrame, repeat_count: int) -> list[pd.DataFrame]: """ This function iterates over the same dataframe to extend small datasets during debugging, applying incremental updates to the `event_dt` and `eventTime` columns. @@ -136,7 +139,7 @@ def repeat_df(df: pd.DataFrame, repeat_count: int) -> typing.List[pd.DataFrame]: Returns ------- - df_array : typing.List[pd.DataFrame] + df_array : list[pd.DataFrame] List of repeated dataframes. """ @@ -159,7 +162,7 @@ def repeat_df(df: pd.DataFrame, repeat_count: int) -> typing.List[pd.DataFrame]: return df_array @staticmethod - def batch_user_split(x: typing.List[pd.DataFrame], + def batch_user_split(x: list[pd.DataFrame], userid_column_name: str, userid_filter: str, datetime_column_name="event_dt"): @@ -168,7 +171,7 @@ def batch_user_split(x: typing.List[pd.DataFrame], Parameters ---------- - x : typing.List[pd.DataFrame] + x : list[pd.DataFrame] List of dataframes. userid_column_name : str Name of a dataframe column used for categorization. @@ -179,7 +182,7 @@ def batch_user_split(x: typing.List[pd.DataFrame], Returns ------- - user_dfs : typing.Dict[str, pd.DataFrame] + user_dfs : dict[str, pd.DataFrame] Dataframes, each of which is associated with a single userid. """ @@ -220,22 +223,22 @@ def batch_user_split(x: typing.List[pd.DataFrame], @staticmethod @abstractmethod - def files_to_dfs_per_user(x: typing.List[str], + def files_to_dfs_per_user(x: list[str], userid_column_name: str, - feature_columns: typing.List[str], + feature_columns: list[str], userid_filter: str = None, - repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: + repeat_count: int = 1) -> dict[str, pd.DataFrame]: """ Stages that extend `AutoencoderSourceStage` must implement this abstract function in order to convert messages in the files to dataframes per userid. Parameters ---------- - x : typing.List[str] + x : list[str] List of messages. userid_column_name : str Name of the column used for categorization. - feature_columns : typing.List[str] + feature_columns : list[str] Feature column names. userid_filter : str Only rows with the supplied userid are kept. repeat_count : int Number of times the given dataframe should be repeated. Returns ------- - : typing.Dict[str, pd.DataFrame] + : dict[str, pd.DataFrame] Dataframe per userid. """ pass @staticmethod - def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): # pylint: disable=unused-argument + def derive_features(df: pd.DataFrame, feature_columns: list[str] | None): # pylint: disable=unused-argument """ If any features are available to be derived, this can be implemented by overriding this function. @@ -259,28 +262,28 @@ def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): # pyl ---------- df : pd.DataFrame A dataframe. 
- feature_columns : typing.List[str] + feature_columns : list[str] Names of columns that need to be derived. Returns ------- - df : typing.List[pd.DataFrame] + df : list[pd.DataFrame] Dataframe with actual and derived columns. """ return df - def _add_derived_features(self, x: typing.Dict[str, pd.DataFrame]): + def _add_derived_features(self, user_dataframes: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: - for user_name in x.keys(): - x[user_name] = self.derive_features(x[user_name], None) + for user_name in user_dataframes.keys(): + user_dataframes[user_name] = self.derive_features(user_dataframes[user_name], None) - return x + return user_dataframes - def _build_user_metadata(self, x: typing.Dict[str, pd.DataFrame]): + def _build_message(self, user_dataframes: dict[str, pd.DataFrame]) -> list[ControlMessage]: - user_metas = [] + messages = [] - for user_name, user_df in x.items(): + for user_name, user_df in user_dataframes.items(): # See if we have seen this user before if (user_name not in self._rows_per_user): @@ -294,12 +297,22 @@ def _build_user_metadata(self, x: typing.Dict[str, pd.DataFrame]): user_df.index = range(self._rows_per_user[user_name], self._rows_per_user[user_name] + len(user_df)) self._rows_per_user[user_name] += len(user_df) - # Now make a UserMessageMeta with the user name - meta = UserMessageMeta(user_df, user_name) + # If we're in GPU mode we need to convert to cuDF + if not isinstance(user_df, self._df_class): + for col in [col for col in user_df.columns if isinstance(user_df[col].dtype, pd.DatetimeTZDtype)]: + user_df[col] = user_df[col].dt.tz_convert(None) + + user_df = self._df_class(user_df) + + # Now make a message with the user name in metadata + meta = MessageMeta(user_df) + message = ControlMessage() + message.payload(meta) + message.set_metadata("user_id", user_name) - user_metas.append(meta) + messages.append(message) - return user_metas + return messages def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: # The first source just produces filenames @@ -319,10 +332,9 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) userid_filter=self._userid_filter, repeat_count=self._repeat_count)), ops.map(self._add_derived_features), - # Now group the batch of dataframes into a single df, split by user, and send a single UserMessageMeta + # Now group the batch of dataframes into a single df, split by user, and send a single ControlMessage # per user - ops.map(self._build_user_metadata), - # Finally flatten to single meta + ops.map(self._build_message), ops.flatten()) builder.make_edge(out_node, post_node) diff --git a/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py b/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py index 2c3c400c96..06f0d47b47 100644 --- a/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py +++ b/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py @@ -16,11 +16,11 @@ import mrc -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -39,7 +39,7 @@ @register_stage("from-databricks-deltalake") 
-class DataBricksDeltaLakeSourceStage(PreallocatorMixin, SingleOutputSource): +class DataBricksDeltaLakeSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage used to load messages from a DeltaLake table. @@ -77,6 +77,10 @@ def __init__(self, self.items_per_page = items_per_page self.offset = 0 + if config.execution_mode == ExecutionMode.GPU: + import cudf + self._cudf = cudf + @property def name(self) -> str: return "from-databricks-deltalake" @@ -104,7 +108,14 @@ def source_generator(self, subscription: mrc.Subscription): str(self.offset), str(self.offset + self.items_per_page + 1)) self.offset += self.items_per_page + 1 - yield MessageMeta(df=cudf.from_pandas(df.toPandas().drop(["_id"], axis=1))) + + df = df.toPandas().drop(["_id"], axis=1) + + if self._config.execution_mode == ExecutionMode.GPU: + df = self._cudf.from_pandas(df) + + yield MessageMeta(df=df) + except Exception as e: logger.error( "Error occurred while reading data from \ diff --git a/python/morpheus/morpheus/stages/input/file_source_stage.py b/python/morpheus/morpheus/stages/input/file_source_stage.py index 675bc5e94b..398c1b126e 100644 --- a/python/morpheus/morpheus/stages/input/file_source_stage.py +++ b/python/morpheus/morpheus/stages/input/file_source_stage.py @@ -19,14 +19,13 @@ import mrc -# pylint: disable=morpheus-incorrect-lib-from-import -from morpheus._lib.messages import MessageMeta as CppMessageMeta from morpheus.cli import register_stage from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.io.deserializers import read_file_to_df from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -35,7 +34,7 @@ @register_stage("from-file", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class FileSourceStage(PreallocatorMixin, SingleOutputSource): +class FileSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Load messages from a file. @@ -140,17 +139,14 @@ def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterable[Me filter_nulls=self._filter_null, filter_null_columns=self._filter_null_columns, parser_kwargs=self._parser_kwargs, - df_type="cudf", + df_type=self.df_type_str, ) for i in range(self._repeat_count): if not subscription.is_subscribed(): break - if (self._build_cpp_node()): - x = CppMessageMeta(df) - else: - x = MessageMeta(df) + x = MessageMeta(df) # If we are looping, copy the object. 
Do this before we push the object in case it changes if (i + 1 < self._repeat_count): diff --git a/python/morpheus/morpheus/stages/input/http_client_source_stage.py b/python/morpheus/morpheus/stages/input/http_client_source_stage.py index 73e9460627..cc49912467 100644 --- a/python/morpheus/morpheus/stages/input/http_client_source_stage.py +++ b/python/morpheus/morpheus/stages/input/http_client_source_stage.py @@ -21,21 +21,22 @@ import mrc import requests -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils import http_utils +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @register_stage("from-http-client", ignore_args=["query_params", "headers", "**request_kwargs"]) -class HttpClientSourceStage(PreallocatorMixin, SingleOutputSource): +class HttpClientSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that polls a remote HTTP server for incoming data. @@ -82,7 +83,8 @@ class HttpClientSourceStage(PreallocatorMixin, SingleOutputSource): Stops ingesting after emitting `stop_after` records (rows in the dataframe). Useful for testing. Disabled if `0` payload_to_df_fn : callable, default None A callable that takes the HTTP payload bytes as the first argument and the `lines` parameter is passed in as - the second argument and returns a cudf.DataFrame. If unset cudf.read_json is used. + the second argument and returns a DataFrame. If unset `cudf.read_json` is used in GPU mode and + `pandas.read_json` in CPU mode. **request_kwargs : dict Additional arguments to pass to the `requests.request` function. """ @@ -101,7 +103,7 @@ def __init__(self, max_retries: int = 10, lines: bool = False, stop_after: int = 0, - payload_to_df_fn: typing.Callable[[bytes, bool], cudf.DataFrame] = None, + payload_to_df_fn: typing.Callable[[bytes, bool], DataFrameType] = None, **request_kwargs): super().__init__(config) self._url = http_utils.prepare_url(url) @@ -139,9 +141,14 @@ def __init__(self, self._stop_after = stop_after self._lines = lines - self._payload_to_df_fn = payload_to_df_fn self._requst_kwargs = request_kwargs + if payload_to_df_fn is not None: + self._payload_to_df_fn = payload_to_df_fn + else: + reader = get_json_reader(self._config.execution_mode) + self._payload_to_df_fn = lambda payload, lines: reader(payload, lines=lines) + @property def name(self) -> str: """Unique name of the stage""" @@ -154,16 +161,13 @@ def supports_cpp_node(self) -> bool: def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(MessageMeta) - def _parse_response(self, response: requests.Response) -> typing.Union[cudf.DataFrame, None]: + def _parse_response(self, response: requests.Response) -> typing.Union[DataFrameType, None]: """ Returns a DataFrame parsed from the response payload. If the response payload is empty, then `None` is returned. 
""" payload = response.content - if self._payload_to_df_fn is not None: - return self._payload_to_df_fn(payload, self._lines) - - return cudf.read_json(payload, lines=self._lines, engine='cudf') + return self._payload_to_df_fn(payload, self._lines) def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterator[MessageMeta]: # Running counter of the number of messages emitted by this source diff --git a/python/morpheus/morpheus/stages/input/http_server_source_stage.py b/python/morpheus/morpheus/stages/input/http_server_source_stage.py index 8bf22084cf..0459a458e4 100644 --- a/python/morpheus/morpheus/stages/input/http_server_source_stage.py +++ b/python/morpheus/morpheus/stages/input/http_server_source_stage.py @@ -23,11 +23,11 @@ import mrc -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -35,6 +35,7 @@ from morpheus.utils.http_utils import HttpParseResponse from morpheus.utils.http_utils import MimeTypes from morpheus.utils.producer_consumer_queue import Closed +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -43,7 +44,7 @@ @register_stage("from-http") -class HttpServerSourceStage(PreallocatorMixin, SingleOutputSource): +class HttpServerSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that starts an HTTP server and listens for incoming requests on a specified endpoint. @@ -81,7 +82,7 @@ class HttpServerSourceStage(PreallocatorMixin, SingleOutputSource): Stops ingesting after emitting `stop_after` records (rows in the dataframe). Useful for testing. Disabled if `0` payload_to_df_fn : callable, default None A callable that takes the HTTP payload string as the first argument and the `lines` parameter is passed in as - the second argument and returns a cudf.DataFrame. When supplied, the C++ implementation of this stage is + the second argument and returns a DataFrame. When supplied, the C++ implementation of this stage is disabled, and the Python impl is used. 
""" @@ -104,7 +105,7 @@ def __init__(self, request_timeout_secs: int = 30, lines: bool = False, stop_after: int = 0, - payload_to_df_fn: typing.Callable[[str, bool], cudf.DataFrame] = None): + payload_to_df_fn: typing.Callable[[str, bool], DataFrameType] = None): super().__init__(config) self._bind_address = bind_address self._port = port @@ -123,9 +124,11 @@ def __init__(self, self._request_timeout_secs = request_timeout_secs self._lines = lines self._stop_after = stop_after - self._payload_to_df_fn = payload_to_df_fn self._http_server = None + # Leave this as None so we can check if it's set later + self._payload_to_df_fn = payload_to_df_fn + # These are only used when C++ mode is disabled self._queue = None self._queue_size = 0 @@ -163,12 +166,7 @@ def stop(self): def _parse_payload(self, payload: str) -> HttpParseResponse: try: - if self._payload_to_df_fn is not None: - df = self._payload_to_df_fn(payload, self._lines) - else: - # engine='cudf' is needed when lines=False to avoid using pandas - df = cudf.read_json(StringIO(initial_value=payload), lines=self._lines, engine='cudf') - + df = self._payload_to_df_fn(payload, self._lines) except Exception as e: err_msg = "Error occurred converting HTTP payload to Dataframe" logger.error("%s: %s", err_msg, e) @@ -270,6 +268,10 @@ def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterator[Me if self._stop_after > 0 and self._records_emitted >= self._stop_after: self._processing = False + def _set_default_payload_to_df_fn(self): + reader = get_json_reader(self._config.execution_mode) + self._payload_to_df_fn = lambda payload, lines: reader(StringIO(initial_value=payload), lines=lines) + def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: if self._build_cpp_node() and self._payload_to_df_fn is None: import morpheus._lib.stages as _stages @@ -289,6 +291,9 @@ def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: lines=self._lines, stop_after=self._stop_after) else: + if self._payload_to_df_fn is None: + self._set_default_payload_to_df_fn() + node = builder.make_source(self.unique_name, self._generate_frames) return node diff --git a/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py b/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py index c9630549d6..4139ab41ed 100644 --- a/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py +++ b/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py @@ -18,6 +18,7 @@ import mrc from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -27,7 +28,7 @@ | typing.Callable[[], typing.Iterable[typing.Any]]) -class InMemoryDataGenStage(SingleOutputSource): +class InMemoryDataGenStage(GpuAndCpuMixin, SingleOutputSource): """ Source stage that generates data in-memory using a provided iterable or generator function. 
@@ -54,7 +55,7 @@ def compute_schema(self, schema: StageSchema): # Set the output schema based on the OutputDataType schema.output_schema.set_type(self._output_data_type) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: diff --git a/python/morpheus/morpheus/stages/input/in_memory_source_stage.py b/python/morpheus/morpheus/stages/input/in_memory_source_stage.py index c977845eaa..726a1e40b1 100644 --- a/python/morpheus/morpheus/stages/input/in_memory_source_stage.py +++ b/python/morpheus/morpheus/stages/input/in_memory_source_stage.py @@ -16,13 +16,12 @@ import mrc -import cudf - from morpheus.config import Config from morpheus.messages import MessageMeta from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.stage_schema import StageSchema from morpheus.stages.input.in_memory_data_generation_stage import InMemoryDataGenStage +from morpheus.utils.type_aliases import DataFrameType class InMemorySourceStage(PreallocatorMixin, InMemoryDataGenStage): @@ -33,13 +32,13 @@ class InMemorySourceStage(PreallocatorMixin, InMemoryDataGenStage): ---------- c : `morpheus.config.Config` Pipeline configuration instance. - dataframes : typing.List[cudf.DataFrame] + dataframes : list[DataFrameType] List of dataframes to emit wrapped in `MessageMeta` instances in order. repeat : int, default = 1, min = 1 Repeats the input dataset multiple times. Useful to extend small datasets for debugging. """ - def __init__(self, c: Config, dataframes: typing.List[cudf.DataFrame], repeat: int = 1): + def __init__(self, c: Config, dataframes: list[DataFrameType], repeat: int = 1): # Prepare a generator function based on the provided dataframes and repeat count self._dataframes = dataframes self._repeat_count = repeat diff --git a/python/morpheus/morpheus/stages/input/kafka_source_stage.py b/python/morpheus/morpheus/stages/input/kafka_source_stage.py index 275418b72b..8770bd91ea 100644 --- a/python/morpheus/morpheus/stages/input/kafka_source_stage.py +++ b/python/morpheus/morpheus/stages/input/kafka_source_stage.py @@ -22,17 +22,17 @@ import mrc import pandas as pd -import cudf - -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.config import auto_determine_bootstrap +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ class AutoOffsetReset(Enum): @register_stage("from-kafka", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class KafkaSourceStage(PreallocatorMixin, SingleOutputSource): +class KafkaSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Load messages from a Kafka cluster. 
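# Editor's note: a recurring idiom in this patch (see the _build_source hunk
# just below, and the Triton, serialize and filter stages elsewhere) is
# deferring `import morpheus._lib.stages` and `import cudf` into the code
# path that needs them, so the module itself imports cleanly on hosts without
# GPU packages. A minimal sketch of the same idiom; the helper name is
# hypothetical:

from morpheus.config import ExecutionMode

def resolve_df_pkg(execution_mode: ExecutionMode):
    # Import the DataFrame package lazily; a missing cudf install only
    # matters if GPU mode is actually requested.
    if execution_mode == ExecutionMode.GPU:
        import cudf
        return cudf

    import pandas
    return pandas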
@@ -128,6 +128,9 @@ def __init__(self, self._poll_interval = pd.Timedelta(poll_interval).total_seconds() self._started = False + # Defined later if in CPU mode + self._json_reader: typing.Callable[..., DataFrameType] = None + self._records_emitted = 0 self._num_messages = 0 @@ -155,7 +158,7 @@ def _process_batch(self, consumer, batch): df = None try: buffer.seek(0) - df = cudf.io.read_json(buffer, engine='cudf', lines=True, orient='records') + df = self._json_reader(buffer, lines=True, orient='records') except Exception as e: logger.error("Error parsing payload into a dataframe : %s", e) finally: @@ -226,6 +229,7 @@ def _source_generator(self, subscription: mrc.Subscription): def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: if (self._build_cpp_node()): + import morpheus._lib.stages as _stages source = _stages.KafkaSourceStage(builder, self.unique_name, self._max_batch_size, @@ -241,6 +245,7 @@ def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: # multiple threads source.launch_options.pe_count = self._max_concurrent else: + self._json_reader = get_json_reader(self._config.execution_mode) source = builder.make_source(self.unique_name, self._source_generator) return source diff --git a/python/morpheus/morpheus/stages/input/rss_source_stage.py b/python/morpheus/morpheus/stages/input/rss_source_stage.py index c9d9d01ac3..a5dc473189 100644 --- a/python/morpheus/morpheus/stages/input/rss_source_stage.py +++ b/python/morpheus/morpheus/stages/input/rss_source_stage.py @@ -20,6 +20,7 @@ from morpheus.config import Config from morpheus.controllers.rss_controller import RSSController from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -28,7 +29,7 @@ @register_stage("from-rss") -class RSSSourceStage(PreallocatorMixin, SingleOutputSource): +class RSSSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Load RSS feed items into a DataFrame.
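# Editor's note: `df_type_str` (passed to RSSController in the hunk below and
# to read_file_to_df in FileSourceStage above) is the string form of the
# execution mode. A minimal sketch of the equivalent selection, assuming the
# "cudf"/"pandas" literals used elsewhere in this diff; the input path is
# hypothetical:

from morpheus.config import Config
from morpheus.config import ExecutionMode
from morpheus.io.deserializers import read_file_to_df

config = Config()
df_type = "cudf" if config.execution_mode == ExecutionMode.GPU else "pandas"
df = read_file_to_df("input.jsonlines", df_type=df_type)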
@@ -82,13 +83,14 @@ def __init__(self, strip_markup=strip_markup, stop_after=stop_after, interval_secs=interval_secs, - should_stop_fn=self.is_stop_requested) + should_stop_fn=self.is_stop_requested, + df_type=self.df_type_str) @property def name(self) -> str: return "from-rss" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False def compute_schema(self, schema: StageSchema): diff --git a/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py b/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py index 86ae3dc6ce..ab7ec49f40 100644 --- a/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py +++ b/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py @@ -21,14 +21,13 @@ import pandas as pd -import cudf - from morpheus.config import Config from morpheus.io.deserializers import read_file_to_df from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage from morpheus.utils import compare_df as compare_df_module from morpheus.utils import concat_df from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type class CompareDataFrameStage(InMemorySinkStage): @@ -74,8 +73,6 @@ def __init__(self, if isinstance(compare_df, str): compare_df = read_file_to_df(compare_df, df_type='pandas') - elif isinstance(compare_df, cudf.DataFrame): - compare_df = compare_df.to_pandas() elif isinstance(compare_df, list): tmp_dfs = [] for item in compare_df: @@ -83,6 +80,8 @@ def __init__(self, tmp_dfs.append(tmp_df) compare_df = pd.concat(tmp_dfs) compare_df.reset_index(inplace=True, drop=True) + elif is_cudf_type(compare_df): + compare_df = compare_df.to_pandas() self._compare_df = compare_df diff --git a/python/morpheus/morpheus/stages/output/http_client_sink_stage.py b/python/morpheus/morpheus/stages/output/http_client_sink_stage.py index a9cb872b4c..083a97b9ce 100644 --- a/python/morpheus/morpheus/stages/output/http_client_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/http_client_sink_stage.py @@ -25,6 +25,7 @@ from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils import http_utils @@ -36,7 +37,7 @@ @register_stage("to-http", ignore_args=["query_params", "headers", "df_to_request_kwargs_fn", "**request_kwargs"]) -class HttpClientSinkStage(PassThruTypeMixin, SinglePortStage): +class HttpClientSinkStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write all messages to an HTTP endpoint. 
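# Editor's note: `is_cudf_type` (morpheus.utils.type_utils, used in the
# CompareDataFrameStage hunk above) tests for cuDF objects without importing
# cudf, which would fail at import time on a CPU-only host. A minimal sketch
# of the normalize-to-pandas pattern this patch applies; the helper name is
# hypothetical:

from morpheus.utils.type_utils import is_cudf_type

def ensure_pandas(df):
    # cuDF frames are converted via to_pandas(); pandas frames pass through.
    if is_cudf_type(df):
        return df.to_pandas()

    return df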
diff --git a/python/morpheus/morpheus/stages/output/http_server_sink_stage.py b/python/morpheus/morpheus/stages/output/http_server_sink_stage.py index dcf59d3864..448285f018 100644 --- a/python/morpheus/morpheus/stages/output/http_server_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/http_server_sink_stage.py @@ -22,15 +22,13 @@ from io import StringIO import mrc -import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.http_utils import HTTPMethod @@ -42,7 +40,7 @@ @register_stage("to-http-server", ignore_args=["df_serializer_fn"]) -class HttpServerSinkStage(PassThruTypeMixin, SinglePortStage): +class HttpServerSinkStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Sink stage that starts an HTTP server and listens for incoming requests on a specified endpoint. @@ -116,6 +114,8 @@ def __init__(self, self._df_serializer_fn = df_serializer_fn or self._default_df_serializer + self._df_pkg = self.get_df_pkg() + # FiberQueue doesn't have a way to check the size, nor does it have a way to check if it's empty without # attempting to perform a read. We'll keep track of the size ourselves. self._queue = queue.Queue(maxsize=max_queue_size or config.edge_buffer_size) @@ -201,10 +201,10 @@ def _request_handler(self, _: str) -> HttpParseResponse: body=err_msg) if (len(data_frames) > 0): - df = data_frames[0] if len(data_frames) > 1: - cat_fn = pd.concat if isinstance(df, pd.DataFrame) else cudf.concat - df = cat_fn(data_frames) + df = self._df_pkg.concat(data_frames) + else: + df = data_frames[0] return HttpParseResponse(status_code=HTTPStatus.OK.value, content_type=self._content_type, diff --git a/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py b/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py index ea2998ea3c..f81a61c169 100644 --- a/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py @@ -18,11 +18,12 @@ import mrc.core.operators as ops from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class InMemorySinkStage(PassThruTypeMixin, SinglePortStage): +class InMemorySinkStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Collects incoming messages into a list that can be accessed after the pipeline is complete. Useful for testing. 
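# Editor's note: in the HttpServerSinkStage hunks above, `self.get_df_pkg()`
# resolves to the cudf module in GPU mode and to pandas in CPU mode, letting
# the request handler concatenate batches with no isinstance checks. A
# minimal sketch of the call pattern (method-body form, assuming a stage that
# stored `self._df_pkg = self.get_df_pkg()` in __init__ as shown above):

def _concat_frames(self, data_frames: list):
    # pandas and cudf both expose concat() with the same call shape.
    if len(data_frames) > 1:
        return self._df_pkg.concat(data_frames)

    return data_frames[0]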
diff --git a/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py b/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py index 6b98ffeb92..53d028d987 100644 --- a/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py @@ -19,8 +19,6 @@ import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import MessageMeta @@ -97,8 +95,9 @@ def write_to_deltalake(meta: MessageMeta): convert cudf to spark dataframe """ df = meta.copy_dataframe() - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() + schema = self._extract_schema_from_pandas_dataframe(df) spark_df = self.spark.createDataFrame(df, schema=schema) spark_df.write \ diff --git a/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py b/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py index eede6926e8..f26948cf6a 100644 --- a/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py @@ -18,10 +18,9 @@ import mrc import mrc.core.operators as ops +import pandas as pd import yaml -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.elasticsearch_controller import ElasticsearchController @@ -110,7 +109,7 @@ def on_data(meta: MessageMeta) -> MessageMeta: self._controller.refresh_client() df = meta.copy_dataframe() - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() logger.debug("Converted cudf of size: %s to pandas dataframe.", len(df)) diff --git a/python/morpheus/morpheus/stages/output/write_to_file_stage.py b/python/morpheus/morpheus/stages/output/write_to_file_stage.py index 46b7e5cec6..9f3298bc61 100644 --- a/python/morpheus/morpheus/stages/output/write_to_file_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_file_stage.py @@ -18,18 +18,18 @@ import mrc import mrc.core.operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common import FileTypes from morpheus.config import Config from morpheus.controllers.write_to_file_controller import WriteToFileController from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("to-file", rename_options={"include_index_col": "--include-index-col"}) -class WriteToFileStage(PassThruTypeMixin, SinglePortStage): +class WriteToFileStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write all messages to a file. 
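# Editor's note: the DeltaLake and Elasticsearch sink hunks above flip the
# type test from "is a cudf DataFrame" to "is not a pandas DataFrame", which
# removes the module-level cudf import entirely. A minimal sketch of the
# inverted check as both sinks use it; the helper name is hypothetical:

import pandas as pd

def to_pandas_if_needed(df):
    # Anything that is not already pandas (i.e. cuDF in GPU mode) is assumed
    # to expose to_pandas(), matching the duck typing in this patch.
    if not isinstance(df, pd.DataFrame):
        df = df.to_pandas()

    return df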
@@ -92,6 +92,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: # Sink to file if (self._build_cpp_node()): + import morpheus._lib.stages as _stages to_file_node = _stages.WriteToFileStage(builder, self.unique_name, self._controller.output_file, diff --git a/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py b/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py index 3546a14563..ad7954f977 100644 --- a/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py @@ -24,6 +24,7 @@ from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -31,7 +32,7 @@ @register_stage("to-kafka") -class WriteToKafkaStage(PassThruTypeMixin, SinglePortStage): +class WriteToKafkaStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Write all messages to a Kafka cluster. diff --git a/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py b/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py index 5937a2077b..e4ab126cdd 100644 --- a/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py @@ -63,7 +63,7 @@ def __init__(self, def name(self) -> str: return "add-class" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: # Enable support by default return True diff --git a/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py b/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py index 75c796ee3a..bb55d1f3b9 100644 --- a/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py +++ b/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py @@ -23,13 +23,14 @@ from morpheus.common import TypeId from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage logger = logging.getLogger(__name__) -class AddScoresStageBase(PassThruTypeMixin, SinglePortStage): +class AddScoresStageBase(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Base class for the `AddScoresStage` and `AddClassificationStage` diff --git a/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py b/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py index 925d0deb73..45fc41ef56 100644 --- a/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py @@ -18,12 +18,12 @@ import mrc from mrc.core import operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common import FilterSource from morpheus.config import Config from morpheus.controllers.filter_detections_controller import FilterDetectionsController from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from 
morpheus.pipeline.stage_schema import StageSchema @@ -31,7 +31,7 @@ @register_stage("filter") -class FilterDetectionsStage(SinglePortStage): +class FilterDetectionsStage(GpuAndCpuMixin, SinglePortStage): """ Filter message by a classification threshold. @@ -113,6 +113,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: if self._build_cpp_node(): + import morpheus._lib.stages as _stages node = _stages.FilterDetectionsStage(builder, self.unique_name, self._controller.threshold, diff --git a/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py b/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py index 7e62870138..25f02d4ea0 100644 --- a/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py @@ -27,22 +27,22 @@ import websockets.legacy.server from websockets.server import serve -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.producer_consumer_queue import AsyncIOProducerConsumerQueue from morpheus.utils.producer_consumer_queue import Closed +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @register_stage("gen-viz", modes=[PipelineModes.NLP], command_args={"deprecated": True}) -class GenerateVizFramesStage(PassThruTypeMixin, SinglePortStage): +class GenerateVizFramesStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write out visualization DataFrames. 
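# Editor's note: `get_df_class()` (used in the hunk just below, and by
# AutoencoderSourceStage above) returns the concrete DataFrame type for the
# configured execution mode, so new frames can be constructed without naming
# cudf. A minimal sketch in function form; `stage` is assumed to be any
# GpuAndCpuMixin-based stage instance:

def make_output_frame(stage):
    # cudf.DataFrame in GPU mode, pandas.DataFrame in CPU mode.
    df_class = stage.get_df_class()
    return df_class({"dt": [0, 1, 2]})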
@@ -81,6 +81,8 @@ def __init__(self, self._server_task: asyncio.Task = None self._server_close_event: asyncio.Event = None + self._df_class: type[DataFrameType] = self.get_df_class() + @property def name(self) -> str: return "gen_viz" @@ -142,7 +144,7 @@ def indent_data(y: str): except Exception: return y - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() df["data"] = df["data"].apply(indent_data) @@ -278,7 +280,7 @@ def write_batch(msg: ControlMessage): columns = ["timestamp", "src_ip", "dest_ip", "secret_keys", "data"] df = msg.payload().get_data(columns) - out_df = cudf.DataFrame() + out_df = self._df_class() out_df["dt"] = (df["timestamp"] - time0).astype(np.int32) out_df["src"] = df["src_ip"].str.ip_to_int().astype(np.uint32) diff --git a/python/morpheus/morpheus/stages/postprocess/serialize_stage.py b/python/morpheus/morpheus/stages/postprocess/serialize_stage.py index ba61d9274b..47afb85082 100644 --- a/python/morpheus/morpheus/stages/postprocess/serialize_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/serialize_stage.py @@ -19,12 +19,12 @@ import mrc from mrc.core import operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.serialize_controller import SerializeController from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @@ -32,7 +32,7 @@ @register_stage("serialize") -class SerializeStage(SinglePortStage): +class SerializeStage(GpuAndCpuMixin, SinglePortStage): """ Includes & excludes columns from messages. @@ -91,6 +91,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: if (self._build_cpp_node()): + import morpheus._lib.stages as _stages node = _stages.SerializeStage(builder, self.unique_name, self._controller.include_columns or [], diff --git a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py index 493c57a6df..5d7e5d5a67 100644 --- a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py @@ -350,7 +350,9 @@ def _calc_timeseries(self, x: ControlMessage, is_complete: bool): # Save this message in the pending queue self._pending_messages.append(x) - new_timedata = x.payload().get_data([self._timestamp_col]).to_pandas() + new_timedata = x.payload().get_data([self._timestamp_col]) + if not isinstance(new_timedata, pd.DataFrame): + new_timedata = new_timedata.to_pandas() # Save this message's event times in the event list. 
Ensure the values are always sorted self._timeseries_data = pd.concat([self._timeseries_data, new_timedata]).sort_index() diff --git a/python/morpheus/morpheus/stages/postprocess/validation_stage.py b/python/morpheus/morpheus/stages/postprocess/validation_stage.py index 99da57b36d..e39c814136 100644 --- a/python/morpheus/morpheus/stages/postprocess/validation_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/validation_stage.py @@ -30,7 +30,7 @@ @register_stage("validate") -class ValidationStage(CompareDataFrameStage): +class ValidationStage(CompareDataFrameStage): # pylint: disable=too-many-ancestors """ Validate pipeline output for testing. diff --git a/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py b/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py index 8c0eaf17fe..7605be4f79 100644 --- a/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py @@ -25,6 +25,7 @@ from morpheus.messages import MessageMeta from morpheus.modules.preprocess.deserialize import DeserializeLoaderFactory from morpheus.pipeline.control_message_stage import ControlMessageStage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.stage_schema import StageSchema logger = logging.getLogger(__name__) @@ -33,7 +34,7 @@ @register_stage("deserialize", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER], ignore_args=["task_type", "task_payload"]) -class DeserializeStage(ControlMessageStage): +class DeserializeStage(GpuAndCpuMixin, ControlMessageStage): """ Messages are logically partitioned based on the pipeline config's `pipeline_batch_size` parameter. @@ -77,15 +78,6 @@ def __init__(self, if ((self._task_type is None) != (self._task_payload is None)): raise ValueError("Both `task_type` and `task_payload` must be specified if either is specified.") - self._module_config = { - "ensure_sliceable_index": self._ensure_sliceable_index, - "task_type": self._task_type, - "task_payload": self._task_payload, - "batch_size": self._batch_size, - "max_concurrency": self._max_concurrent, - "should_log_timestamp": self._should_log_timestamps - } - @property def name(self) -> str: return "deserialize" @@ -116,8 +108,17 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> builder.make_edge(input_node, out_node) else: + module_config = { + "ensure_sliceable_index": self._ensure_sliceable_index, + "task_type": self._task_type, + "task_payload": self._task_payload, + "batch_size": self._batch_size, + "max_concurrency": self._max_concurrent, + "should_log_timestamp": self._should_log_timestamps + } + module_loader = DeserializeLoaderFactory.get_instance(module_name=f"deserialize_{self.unique_name}", - module_config=self._module_config) + module_config=module_config) module = module_loader.load(builder=builder) mod_in_node = module.input_port("input") diff --git a/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py b/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py index 697cce089a..7926aeb8d4 100644 --- a/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py @@ -12,21 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import typing - import mrc from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage from morpheus.config import Config -from morpheus.config import PipelineModes from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -@register_stage("dropna", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class DropNullStage(PassThruTypeMixin, SinglePortStage): +@register_stage("dropna") +class DropNullStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Drop null data entries from a DataFrame. @@ -51,27 +49,26 @@ def __init__(self, c: Config, column: str): def name(self) -> str: return "dropna" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Accepted input types for this stage are returned. Returns ------- - typing.Tuple + tuple Accepted input types. """ return (MessageMeta, ) - def supports_cpp_node(self): - # Enable support by default + def supports_cpp_node(self) -> bool: return False def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - def on_next(x: MessageMeta): - - y = MessageMeta(x.df[~x.df[self._column].isna()]) + def on_next(msg: MessageMeta): + df = msg.copy_dataframe() + y = MessageMeta(df[~df[self._column].isna()]) return y diff --git a/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py b/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py index d69504dd27..e31f151068 100644 --- a/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py @@ -17,11 +17,12 @@ from morpheus.config import Config from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class GroupByColumnStage(PassThruTypeMixin, SinglePortStage): +class GroupByColumnStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Group the incoming message by a column in the DataFrame. 
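
Several of the hunks above follow the same recipe: prepend `GpuAndCpuMixin` to the stage's existing base-class list and keep any GPU-only imports out of module scope. As a reference for downstream stage authors, here is a minimal sketch of a custom stage written against the same pattern (`MyPassThruStage` and its body are illustrative only, not part of this patch):

```python
import mrc
from mrc.core import operators as ops

from morpheus.config import Config
from morpheus.messages import MessageMeta
from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin
from morpheus.pipeline.single_port_stage import SinglePortStage


class MyPassThruStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage):
    """Pass-through stage usable in both GPU and CPU-only pipelines."""

    @property
    def name(self) -> str:
        return "my-pass-thru"

    def accepted_types(self) -> tuple:
        return (MessageMeta, )

    def supports_cpp_node(self) -> bool:
        return False

    def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject:
        # Pure Python node; nothing here depends on cuDF or CuPy
        node = builder.make_node(self.unique_name, ops.map(lambda msg: msg))
        builder.make_edge(input_node, node)
        return node
```
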
diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py index c7c639eddd..8cd18cfc87 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py @@ -17,13 +17,12 @@ from functools import partial import cupy as cp -import mrc -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage logger = logging.getLogger(__name__) @@ -79,7 +78,7 @@ def pre_process_batch(msg: ControlMessage, fea_len: int, feature_columns: typing morpheus.messages.ControlMessage """ - meta_df = msg.payload().get_data(msg.payload().df.columns.intersection(feature_columns)) + meta_df = msg.payload().get_data(msg.payload().df.columns.intersection(feature_columns)).to_pandas() autoencoder = msg.get_metadata("model") scores_mean = msg.get_metadata("train_scores_mean") @@ -101,13 +100,10 @@ def pre_process_batch(msg: ControlMessage, fea_len: int, feature_columns: typing msg.set_metadata("model", autoencoder) msg.set_metadata("train_scores_mean", scores_mean) msg.set_metadata("train_scores_std", scores_std) - msg.tensors(_messages.TensorMemory(count=count, tensors={"input": inputs, "seq_ids": seg_ids})) + msg.tensors(TensorMemory(count=count, tensors={"input": inputs, "seq_ids": seg_ids})) return msg def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: return partial(PreprocessAEStage.pre_process_batch, fea_len=self._fea_length, feature_columns=self._feature_columns) - - def _get_preprocess_node(self, builder: mrc.Builder): - raise NotImplementedError("No C++ node for AE") diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py index 0a0d36b97e..d8f5debf28 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py @@ -13,7 +13,6 @@ # limitations under the License. import typing -from abc import abstractmethod import mrc from mrc.core import operators as ops @@ -38,7 +37,6 @@ class PreprocessBaseStage(ControlMessageStage): def __init__(self, c: Config): super().__init__(c) - self._preprocess_fn = None self._should_log_timestamps = True def accepted_types(self) -> typing.Tuple: @@ -49,24 +47,27 @@ def accepted_types(self) -> typing.Tuple: return (ControlMessage, ) def compute_schema(self, schema: StageSchema): - self._preprocess_fn = self._get_preprocess_fn() schema.output_schema.set_type(ControlMessage) - @abstractmethod def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - pass + """ + This method should be implemented by any subclasses with a Python implementation. + """ + raise NotImplementedError("No Python implementation provided by this stage") - @abstractmethod def _get_preprocess_node(self, builder: mrc.Builder) -> mrc.SegmentObject: - pass + """ + This method should be implemented by any subclasses with a C++ implementation. 
+ """ + raise NotImplementedError("No C++ implementation provided by this stage") def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - assert self._preprocess_fn is not None, "Preprocess function not set" if self._build_cpp_node(): node = self._get_preprocess_node(builder) node.launch_options.pe_count = self._config.num_threads else: - node = builder.make_node(self.unique_name, ops.map(self._preprocess_fn)) + preprocess_fn = self._get_preprocess_fn() + node = builder.make_node(self.unique_name, ops.map(preprocess_fn)) builder.make_edge(input_node, node) diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py index e113958c4c..b3e6895ae0 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py @@ -13,22 +13,12 @@ # limitations under the License. import logging -import typing -from functools import partial -import cupy as cp import mrc -import numpy as np -import pandas as pd -import cudf - -import morpheus._lib.messages as _messages -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage logger = logging.getLogger(__name__) @@ -59,62 +49,9 @@ def __init__(self, c: Config): def name(self) -> str: return "preprocess-fil" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return True - @staticmethod - def pre_process_batch(msg: ControlMessage, fea_len: int, fea_cols: typing.List[str]) -> ControlMessage: - """ - For FIL category usecases, this function performs pre-processing. - - Parameters - ---------- - msg : `morpheus.messages.ControlMessage` - Input rows received from Deserialized stage. - fea_len : int - Number features are being used in the inference. - fea_cols : typing.Tuple[str] - List of columns that are used as features. - - Returns - ------- - `morpheus.messages.ControlMessage` - - """ - try: - df: cudf.DataFrame = msg.payload().get_data(fea_cols) - except KeyError: - logger.exception("Requested feature columns does not exist in the dataframe.", exc_info=True) - raise - - # Extract just the numbers from each feature col. Not great to operate on x.meta.df here but the operations will - # only happen once. 
- for col in fea_cols: - if (df[col].dtype == np.dtype(str) or df[col].dtype == np.dtype(object)): - # If the column is a string, parse the number - df[col] = df[col].str.extract(r"(\d+)", expand=False).astype("float32") - elif (df[col].dtype != np.float32): - # Convert to float32 - df[col] = df[col].astype("float32") - - if (isinstance(df, pd.DataFrame)): - df = cudf.from_pandas(df) - - # Convert the dataframe to cupy the same way cuml does - data = cp.asarray(df.to_cupy()) - - count = data.shape[0] - - seg_ids = cp.zeros((count, 3), dtype=cp.uint32) - seg_ids[:, 0] = cp.arange(0, count, dtype=cp.uint32) - seg_ids[:, 2] = fea_len - 1 - - # We need the C++ impl of TensorMemory until #1646 is resolved - msg.tensors(_messages.TensorMemory(count=count, tensors={"input__0": data, "seq_ids": seg_ids})) - return msg - - def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - return partial(PreprocessFILStage.pre_process_batch, fea_len=self._fea_length, fea_cols=self.features) - def _get_preprocess_node(self, builder: mrc.Builder): + import morpheus._lib.stages as _stages return _stages.PreprocessFILStage(builder, self.unique_name, self.features) diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py index 1f92d97b8f..3a85af54cb 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -12,62 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import base64 -import json import logging -import typing -from functools import partial -import cupy as cp import mrc -import numpy as np -import cudf - -import morpheus._lib.messages as _messages -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.cli.utils import MorpheusRelativePath from morpheus.cli.utils import get_package_relative_file from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage -from morpheus.utils.cudf_subword_helper import tokenize_text_series logger = logging.getLogger(__name__) -def cupyarray_to_base64(cupy_array): - array_bytes = cupy_array.get().tobytes() - array_shape = cupy_array.shape - array_dtype = str(cupy_array.dtype) - - # Create a dictionary to store bytes, shape, and dtype - encoded_dict = {'bytes': base64.b64encode(array_bytes).decode("utf-8"), 'shape': array_shape, 'dtype': array_dtype} - - # Convert dictionary to JSON string for storage - return json.dumps(encoded_dict) - - -def base64_to_cupyarray(base64_str): - # Convert JSON string back to dictionary - encoded_dict = json.loads(base64_str) - - # Extract bytes, shape, and dtype - array_bytes = base64.b64decode(encoded_dict['bytes']) - array_shape = tuple(encoded_dict['shape']) - array_dtype = encoded_dict['dtype'] - - # Convert bytes back to a NumPy array and reshape - np_array = np.frombuffer(array_bytes, dtype=array_dtype).reshape(array_shape) - - # Convert NumPy array to CuPy array - cp_array = cp.array(np_array) - - return cp_array - - @register_stage( "preprocess", modes=[PipelineModes.NLP], @@ -133,64 +91,11 @@ def __init__(self, def name(self) -> str: return "preprocess-nlp" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return True - @staticmethod - def 
pre_process_batch(message: ControlMessage, - vocab_hash_file: str, - do_lower_case: bool, - seq_len: int, - stride: int, - truncation: bool, - add_special_tokens: bool, - column: str) -> ControlMessage: - """ - For NLP category use cases, this function performs pre-processing. - - [parameters are the same as the original function] - - Returns - ------- - `morpheus.messages.ControlMessage` - - """ - with message.payload().mutable_dataframe() as mdf: - text_series = cudf.Series(mdf[column]) - - tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file, - do_lower_case=do_lower_case, - text_ser=text_series, - seq_len=seq_len, - stride=stride, - truncation=truncation, - add_special_tokens=add_special_tokens) - - del text_series - - # We need the C++ impl of TensorMemory until #1646 is resolved - message.tensors( - _messages.TensorMemory(count=tokenized.input_ids.shape[0], - tensors={ - "input_ids": tokenized.input_ids, - "input_mask": tokenized.input_mask, - "seq_ids": tokenized.segment_ids - })) - - message.set_metadata("inference_memory_params", {"inference_type": "nlp"}) - return message - - def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - return partial(PreprocessNLPStage.pre_process_batch, - vocab_hash_file=self._vocab_hash_file, - do_lower_case=self._do_lower_case, - stride=self._stride, - seq_len=self._seq_length, - truncation=self._truncation, - add_special_tokens=self._add_special_tokens, - column=self._column) - def _get_preprocess_node(self, builder: mrc.Builder): + import morpheus._lib.stages as _stages return _stages.PreprocessNLPStage(builder, self.unique_name, self._vocab_hash_file, diff --git a/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py b/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py index ae246c4015..c765aa7f20 100644 --- a/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py @@ -16,7 +16,6 @@ import importlib import logging import pathlib -import typing import dill import mrc @@ -27,10 +26,9 @@ from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import UserMessageMeta from morpheus.models.dfencoder import AutoEncoder -from morpheus.pipeline.control_message_stage import ControlMessageStage -from morpheus.pipeline.stage_schema import StageSchema +from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin +from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.seed import manual_seed logger = logging.getLogger(__name__) @@ -123,7 +121,7 @@ def train(self, df: pd.DataFrame) -> AutoEncoder: @register_stage("train-ae", modes=[PipelineModes.AE]) -class TrainAEStage(ControlMessageStage): +class TrainAEStage(PassThruTypeMixin, SinglePortStage): """ Train an Autoencoder model on incoming data. @@ -196,34 +194,33 @@ def __init__(self, self._pretrained_model: AutoEncoder = None # Per user model data - self._user_models: typing.Dict[str, _UserModelManager] = {} + self._user_models: dict[str, _UserModelManager] = {} @property def name(self) -> str: return "train-ae" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Returns accepted input types for this stage. 
""" - return (UserMessageMeta, ) - - def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(ControlMessage) + return (ControlMessage, ) def supports_cpp_node(self): return False - def _get_per_user_model(self, x: UserMessageMeta): + def _get_per_user_model(self, msg: ControlMessage): model = None train_scores_mean = None train_scores_std = None user_model = None - if x.user_id in self._user_models: - user_model = self._user_models[x.user_id] + user_id = msg.get_metadata("user_id") + + if user_id in self._user_models: + user_model = self._user_models[user_id] elif self._use_generic_model and "generic" in self._user_models.keys(): user_model = self._user_models["generic"] @@ -234,17 +231,21 @@ def _get_per_user_model(self, x: UserMessageMeta): return model, train_scores_mean, train_scores_std - def _train_model(self, x: UserMessageMeta) -> list[ControlMessage]: + def _train_model(self, msg: ControlMessage) -> list[ControlMessage]: + user_id = msg.get_metadata("user_id") - if (x.user_id not in self._user_models): - self._user_models[x.user_id] = _UserModelManager(self._config, - x.user_id, - False, - self._train_epochs, - self._train_max_history, - self._seed) + if (user_id not in self._user_models): + self._user_models[user_id] = _UserModelManager(self._config, + user_id, + False, + self._train_epochs, + self._train_max_history, + self._seed) - return self._user_models[x.user_id].train(x.df) + with msg.payload().mutable_dataframe() as cdf: + pdf = cdf.to_pandas() + + return self._user_models[user_id].train(pdf) def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: get_model_fn = None @@ -312,29 +313,23 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> else: get_model_fn = self._train_model - def on_next(x: UserMessageMeta): - - model, scores_mean, scores_std = get_model_fn(x) + def on_next(full_message: ControlMessage): - # cuDF does not yet support timezone-aware datetimes - # Remove timezone information from pd.DatetimeTZDtype columns - with x.mutable_dataframe() as df: - for col in [col for col in df.columns if isinstance(df[col].dtype, pd.DatetimeTZDtype)]: - df[col] = df[col].dt.tz_convert(None) + model, scores_mean, scores_std = get_model_fn(full_message) - full_message = ControlMessage() - full_message.payload(x) full_message.set_metadata("model", model) full_message.set_metadata("train_scores_mean", scores_mean) full_message.set_metadata("train_scores_std", scores_std) + # cuDF does not yet support timezone-aware datetimes + # Remove timezone information from pd.DatetimeTZDtype columns + meta = full_message.payload() to_send = [] # Now split into batches - for i in range(0, full_message.payload().count, self._batch_size): + for i in range(0, meta.count, self._batch_size): output_message = ControlMessage(full_message) - output_message.payload(full_message.payload().get_slice( - i, min(i + self._batch_size, full_message.payload().count))) + output_message.payload(meta.get_slice(i, min(i + self._batch_size, meta.count))) to_send.append(output_message) return to_send diff --git a/python/morpheus/morpheus/utils/column_info.py b/python/morpheus/morpheus/utils/column_info.py index 75119320e4..6fe8a1cb8f 100644 --- a/python/morpheus/morpheus/utils/column_info.py +++ b/python/morpheus/morpheus/utils/column_info.py @@ -22,7 +22,8 @@ import pandas as pd -import cudf +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type logger = 
logging.getLogger(f"morpheus.{__name__}") @@ -30,7 +31,7 @@ # Note(Devin): Proxying this for backwards compatibility. Had to move the primary definition to avoid circular imports. -def process_dataframe(df_in: typing.Union[pd.DataFrame, cudf.DataFrame], input_schema) -> pd.DataFrame: +def process_dataframe(df_in: DataFrameType, input_schema) -> pd.DataFrame: """ Processes a dataframe according to the given schema. @@ -83,7 +84,7 @@ def create_increment_col(df: pd.DataFrame, """ # Ensure we are pandas for this - if (isinstance(df, cudf.DataFrame)): + if (not isinstance(df, pd.DataFrame)): df = df.to_pandas() time_col = df[timestamp_column].fillna(pd.to_datetime(DEFAULT_DATE)) @@ -595,16 +596,16 @@ class PreparedDFInfo: Attributes ---------- - df : typing.Union[pd.DataFrame, cudf.DataFrame] + df : DataFrameType The prepared DataFrame. - columns_to_preserve : typing.List[str] + columns_to_preserve : list[str] A list of column names that are to be preserved. """ - df: typing.Union[pd.DataFrame, cudf.DataFrame] - columns_to_preserve: typing.List[str] + df: DataFrameType + columns_to_preserve: list[str] -def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], +def _json_flatten(df_input: DataFrameType, input_columns: dict[str, str], json_cols: list[str], preserve_re: re.Pattern = None): @@ -614,7 +615,7 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], Parameters ---------- - df_input : typing.Union[pd.DataFrame, cudf.DataFrame] + df_input : DataFrameType DataFrame to process. input_columns : dict[str, str] The final input columns that are needed for processing. All other columns will be removed @@ -625,7 +626,7 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], Returns ------- - typing.Union[pd.DataFrame, cudf.DataFrame] + DataFrameType The processed DataFrame. """ @@ -640,10 +641,9 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], # Check if we even have any JSON columns to flatten if (not df_input.columns.intersection(json_cols).empty): - convert_to_cudf = False + is_cudf = is_cudf_type(df_input) - if (isinstance(df_input, cudf.DataFrame)): - convert_to_cudf = True + if (is_cudf): df_input = df_input.to_pandas() json_normalized = [] @@ -672,7 +672,8 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], # Combine the original DataFrame with the normalized JSON columns df_input = pd.concat([df_input[columns_to_keep]] + json_normalized, axis=1) - if (convert_to_cudf): + if (is_cudf): + import cudf df_input = cudf.from_pandas(df_input).reset_index(drop=True) # Remove all columns that are not in the input columns list. Ensure the correct types diff --git a/python/morpheus/morpheus/utils/concat_df.py b/python/morpheus/morpheus/utils/concat_df.py index f709d46c10..1956f83730 100644 --- a/python/morpheus/morpheus/utils/concat_df.py +++ b/python/morpheus/morpheus/utils/concat_df.py @@ -14,13 +14,12 @@ import pandas as pd -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.utils.type_utils import is_cudf_type -def concat_dataframes(messages: list[ControlMessage] | list[MessageMeta]) -> pd.DataFrame: +def concat_dataframes(messages: list[ControlMessage | MessageMeta]) -> pd.DataFrame: """ Concatinate the DataFrame associated with the collected messages into a single Pandas DataFrame. @@ -43,7 +42,7 @@ def concat_dataframes(messages: list[ControlMessage] | list[MessageMeta]) -> pd. 
else: raise ValueError("Invalid message type") - if isinstance(df, cudf.DataFrame): + if is_cudf_type(df): df = df.to_pandas() all_meta.append(df) diff --git a/python/morpheus/morpheus/utils/module_utils.py b/python/morpheus/morpheus/utils/module_utils.py index f1aca63334..a250f1a650 100644 --- a/python/morpheus/morpheus/utils/module_utils.py +++ b/python/morpheus/morpheus/utils/module_utils.py @@ -21,12 +21,10 @@ from typing import Type import mrc -import pandas as pd from pydantic import BaseModel -import cudf - from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj logger = logging.getLogger(__name__) @@ -190,9 +188,9 @@ def merge_dictionaries(primary_dict, secondary_dict): } -def to_period_approximation(data_df: DataFrameType, period: str): +def to_period_approximation(data_df: DataFrameType, period: str) -> DataFrameType: """ - This function converts a cudf dataframe to a period approximation. + This function converts a dataframe to a period approximation. Parameters ---------- @@ -203,7 +201,7 @@ def to_period_approximation(data_df: DataFrameType, period: str): Returns ------- - cudf.DataFrame + DataFrame Period approximation of the input cudf/pandas dataframe. """ @@ -216,8 +214,8 @@ def to_period_approximation(data_df: DataFrameType, period: str): strptime_format = period_to_strptime[period] - df_mod = cudf if isinstance(data_df, cudf.DataFrame) else pd - data_df["period"] = df_mod.to_datetime(data_df["ts"].dt.strftime(strptime_format) + '-1', + df_pkg = get_df_pkg_from_obj(data_df) + data_df["period"] = df_pkg.to_datetime(data_df["ts"].dt.strftime(strptime_format) + '-1', format=f"{strptime_format}-%w") return data_df diff --git a/python/morpheus/morpheus/utils/schema_transforms.py b/python/morpheus/morpheus/utils/schema_transforms.py index 1cf8b65183..162a2064db 100644 --- a/python/morpheus/morpheus/utils/schema_transforms.py +++ b/python/morpheus/morpheus/utils/schema_transforms.py @@ -17,9 +17,12 @@ import pandas as pd -import cudf - from morpheus.utils.column_info import DataFrameInputSchema +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + import cudf logger = logging.getLogger(__name__) @@ -34,16 +37,16 @@ def process_dataframe( @typing.overload def process_dataframe( - df_in: cudf.DataFrame, + df_in: "cudf.DataFrame", input_schema: DataFrameInputSchema, -) -> cudf.DataFrame: +) -> "cudf.DataFrame": ... def process_dataframe( - df_in: typing.Union[pd.DataFrame, cudf.DataFrame], + df_in: DataFrameType, input_schema: DataFrameInputSchema, -) -> typing.Union[pd.DataFrame, cudf.DataFrame]: +) -> DataFrameType: """ Applies column transformations to the input dataframe as defined by the `input_schema`. 
@@ -72,10 +75,9 @@ def process_dataframe(
 
     output_df = pd.DataFrame()
 
-    convert_to_cudf = False
-    if (isinstance(df_in, cudf.DataFrame)):
+    is_cudf = is_cudf_type(df_in)
+    if (is_cudf):
         df_in = df_in.to_pandas()
-        convert_to_cudf = True
 
     # Iterate over the column info
     for ci in input_schema.column_info:
@@ -94,7 +96,8 @@ def process_dataframe(
 
     output_df[match_columns] = df_in[match_columns]
 
-    if (convert_to_cudf):
+    if (is_cudf):
+        import cudf
         return cudf.from_pandas(output_df)
 
     return output_df
diff --git a/python/morpheus/morpheus/utils/seed.py b/python/morpheus/morpheus/utils/seed.py
index b016731fa6..d64cd6a6a4 100644
--- a/python/morpheus/morpheus/utils/seed.py
+++ b/python/morpheus/morpheus/utils/seed.py
@@ -20,17 +20,26 @@
 import torch
 
 
-def manual_seed(seed: int):
+def manual_seed(seed: int, cpu_only: bool = False):
     """
-    Manually see the random number generators for the stdlib, PyTorch, NumPy and CuPy
+    Manually seed the random number generators for the Python standard lib, PyTorch, NumPy and CuPy
+
+    Parameters
+    ----------
+    seed : int
+        The seed value to use
+    cpu_only : bool, default = False
+        When set to True, CuPy and CUDA specific PyTorch settings are not set.
     """
     random.seed(seed)
     np.random.seed(seed)
-    cp.random.seed(seed)
 
-    torch.manual_seed(seed)
-    torch.cuda.manual_seed_all(seed)  # the "all" refers to all GPUs
-    torch.backends.cudnn.benchmark = False
-    torch.backends.cudnn.deterministic = True
+    torch.manual_seed(seed)
+
+    if not cpu_only:
+        cp.random.seed(seed)
+
+        torch.cuda.manual_seed_all(seed)  # the "all" refers to all GPUs
+
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
diff --git a/python/morpheus/morpheus/utils/type_aliases.py b/python/morpheus/morpheus/utils/type_aliases.py
index a3e7ddeed0..5e977918c7 100644
--- a/python/morpheus/morpheus/utils/type_aliases.py
+++ b/python/morpheus/morpheus/utils/type_aliases.py
@@ -15,9 +15,16 @@
 
 import typing
 
-import pandas as pd
+if typing.TYPE_CHECKING:
+    import cupy
+    import numpy
+    import pandas
 
-import cudf
+    import cudf
 
-DataFrameType = typing.Union[pd.DataFrame, cudf.DataFrame]
-SeriesType = typing.Union[pd.Series, cudf.Series]
+DataFrameModule = typing.Literal["cudf", "pandas"]
+DataFrameType = typing.Union["pandas.DataFrame", "cudf.DataFrame"]
+SeriesType = typing.Union["pandas.Series", "cudf.Series"]
+
+NDArrayType = typing.Union["numpy.ndarray", "cupy.ndarray"]
+TensorMapType = dict[str, NDArrayType]
diff --git a/python/morpheus/morpheus/utils/type_utils.py b/python/morpheus/morpheus/utils/type_utils.py
index a3aefdde8d..7dd629b687 100644
--- a/python/morpheus/morpheus/utils/type_utils.py
+++ b/python/morpheus/morpheus/utils/type_utils.py
@@ -17,6 +17,14 @@
 import typing
 from collections import defaultdict
 
+import numpy as np
+import pandas as pd
+
+from morpheus.config import CppConfig
+from morpheus.config import ExecutionMode
+from morpheus.utils.type_aliases import DataFrameModule
+from morpheus.utils.type_aliases import DataFrameType
+
 # pylint: disable=invalid-name
 T_co = typing.TypeVar("T_co", covariant=True)
 
@@ -162,3 +170,129 @@ def get_full_qualname(klass: type) -> str:
     if module == '__builtin__':
         return klass.__qualname__
     return module + '.' + klass.__qualname__
+
+
+def df_type_str_to_exec_mode(df_type_str: DataFrameModule) -> ExecutionMode:
+    """
+    Return the appropriate execution mode based on the DataFrame type string.
+ """ + if df_type_str == "cudf": + return ExecutionMode.GPU + if df_type_str == "pandas": + return ExecutionMode.CPU + + valid_values = ", ".join(typing.get_args(DataFrameModule)) + raise ValueError(f"Invalid DataFrame type string: {df_type_str}, valid values are: {valid_values}") + + +def exec_mode_to_df_type_str(execution_mode: ExecutionMode) -> DataFrameModule: + if execution_mode == ExecutionMode.GPU: + return "cudf" + + return "pandas" + + +def cpp_mode_to_exec_mode() -> ExecutionMode: + if CppConfig.get_should_use_cpp(): + return ExecutionMode.GPU + return ExecutionMode.CPU + + +def df_type_str_to_pkg(df_type_str: DataFrameModule) -> types.ModuleType: + """ + Return the appropriate DataFrame package based on the DataFrame type string. + """ + if df_type_str == "cudf": + import cudf + return cudf + if df_type_str == "pandas": + return pd + + valid_values = ", ".join(typing.get_args(DataFrameModule)) + raise ValueError(f"Invalid DataFrame type string: {df_type_str}, valid values are: {valid_values}") + + +@typing.overload +def get_df_pkg(selector: DataFrameModule = None) -> types.ModuleType: + ... + + +@typing.overload +def get_df_pkg(selector: ExecutionMode = None) -> types.ModuleType: + ... + + +def get_df_pkg(selector: ExecutionMode | DataFrameModule = None) -> types.ModuleType: + """ + Return the appropriate DataFrame package based on the execution mode. + """ + if selector is None: + execution_mode = cpp_mode_to_exec_mode() + elif not isinstance(selector, ExecutionMode): + execution_mode = df_type_str_to_exec_mode(selector) + else: + execution_mode = selector + + if execution_mode == ExecutionMode.GPU: + import cudf + return cudf + + return pd + + +@typing.overload +def get_df_class(selector: DataFrameModule = None) -> type[DataFrameType]: + ... + + +@typing.overload +def get_df_class(selector: ExecutionMode = None) -> type[DataFrameType]: + ... + + +def get_df_class(selector: ExecutionMode | DataFrameModule = None) -> type[DataFrameType]: + """ + Return the appropriate DataFrame class based on the execution mode. + """ + df_pkg = get_df_pkg(selector) + return df_pkg.DataFrame + + +def is_cudf_type(obj: typing.Any) -> bool: + """ + Check if a given object (DataFrame, Series, RangeIndex etc...) is a cuDF type. + """ + return "cudf" in str(type(obj)) + + +def get_df_pkg_from_obj(obj: typing.Any) -> types.ModuleType: + """ + Return the appropriate DataFrame package based on the DataFrame object. + """ + if is_cudf_type(obj): + import cudf + return cudf + + return pd + + +def is_dataframe(obj: typing.Any) -> bool: + """ + Check if a given object is a pandas or cudf DataFrame. + """ + df_pkg = get_df_pkg_from_obj(obj) + return isinstance(obj, df_pkg.DataFrame) + + +def get_array_pkg(execution_mode: ExecutionMode = None) -> types.ModuleType: + """ + Return the appropriate array package (CuPy for GPU, NumPy for CPU) based on the execution mode. + """ + if execution_mode is None: + execution_mode = cpp_mode_to_exec_mode() + + if execution_mode == ExecutionMode.GPU: + import cupy + return cupy + + return np diff --git a/python/morpheus_dfp/morpheus_dfp/messages/__init__.py b/python/morpheus_dfp/morpheus_dfp/messages/__init__.py deleted file mode 100644 index 66061e580b..0000000000 --- a/python/morpheus_dfp/morpheus_dfp/messages/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py b/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py deleted file mode 100644 index 49b8c98ba9..0000000000 --- a/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dataclasses -import logging - -import pandas as pd - -from morpheus.messages.message_meta import MessageMeta - -logger = logging.getLogger(__name__) - - -@dataclasses.dataclass(init=False) -class DFPMessageMeta(MessageMeta, cpp_class=None): - """ - This class extends MessageMeta to also hold userid corresponding to batched metadata. - - Parameters - ---------- - df : pandas.DataFrame - Input rows in dataframe. - user_id : str - User id. - - """ - user_id: str - - def __init__(self, df: pd.DataFrame, user_id: str) -> None: - super().__init__(df) - self.user_id = user_id diff --git a/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py b/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py index 2a66a04d9d..c710d09f9f 100644 --- a/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py +++ b/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py @@ -22,9 +22,9 @@ import cudf from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.model_cache import ModelCache from morpheus_dfp.utils.model_cache import ModelManager from morpheus_dfp.utils.module_ids import DFP_INFERENCE @@ -111,11 +111,11 @@ def process_task(control_message: ControlMessage) -> ControlMessage: output_df = cudf.concat([payload.df, results_df[results_cols]], axis=1) # Create an output message to allow setting meta + meta = MessageMeta(output_df) + meta.set_data('model_version', f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") output_message = ControlMessage() - output_message.payload(DFPMessageMeta(output_df, user_id=user_id)) - - output_message.payload().set_data('model_version', - f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") + output_message.payload(meta) + output_message.set_metadata("user_id", user_id) if logger.isEnabledFor(logging.DEBUG): load_model_duration = (post_model_time - start_time) * 1000.0 diff --git a/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py b/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py index 
234855d9f1..6bc41d1d09 100644 --- a/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py +++ b/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py @@ -21,10 +21,10 @@ import cudf from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.models.dfencoder import AutoEncoder from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.module_ids import DFP_TRAINING logger = logging.getLogger(f"morpheus.{__name__}") @@ -97,10 +97,9 @@ def on_data(control_message: ControlMessage) -> list[ControlMessage]: model.fit(train_df, epochs=epochs, validation_data=validation_df, run_validation=run_validation) logger.debug("Training AE model for user: '%s'... Complete.", user_id) - dfp_mm = DFPMessageMeta(cudf.from_pandas(final_df), user_id=user_id) - + meta = MessageMeta(cudf.from_pandas(final_df)) output_message = ControlMessage() - output_message.payload(dfp_mm) + output_message.payload(meta) output_message.set_metadata("user_id", user_id) output_message.set_metadata("model", model) output_message.set_metadata("train_scores_mean", 0.0) diff --git a/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py b/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py index f9233c6f89..7ef67ec88c 100644 --- a/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py +++ b/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py @@ -22,11 +22,13 @@ import pandas as pd from mrc.core import operators as ops +import cudf + from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.cached_user_window import CachedUserWindow from morpheus_dfp.utils.logging_timer import log_time @@ -88,7 +90,7 @@ def supports_cpp_node(self): def accepted_types(self) -> typing.Tuple: """Input types accepted by this stage.""" - return (DFPMessageMeta, ) + return (ControlMessage, ) def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(ControlMessage) @@ -115,13 +117,13 @@ def _get_user_cache(self, user_id: str) -> typing.Generator[CachedUserWindow, No # # When it returns, make sure to save # user_cache.save() - def _build_window(self, message: DFPMessageMeta) -> ControlMessage: + def _build_window(self, message: ControlMessage) -> ControlMessage: - user_id = message.user_id + user_id = message.get_metadata('user_id') with self._get_user_cache(user_id) as user_cache: - incoming_df = message.get_data() + incoming_df = message.payload().get_data().to_pandas() # existing_df = user_cache.df if (not user_cache.append_dataframe(incoming_df=incoming_df)): @@ -161,12 +163,12 @@ def _build_window(self, message: DFPMessageMeta) -> ControlMessage: # Otherwise return a new message response_msg = ControlMessage() - response_msg.payload(DFPMessageMeta(df=train_df, user_id=user_id)) + response_msg.payload(MessageMeta(df=cudf.DataFrame(train_df))) response_msg.set_metadata("user_id", user_id) return response_msg - def on_data(self, message: DFPMessageMeta) -> ControlMessage: + def on_data(self, message: ControlMessage) -> ControlMessage: """ Emits a new message containing the rolling window for the user if and only if 
the history requirments are met, returns `None` otherwise. @@ -180,10 +182,10 @@ def on_data(self, message: DFPMessageMeta) -> ControlMessage: log_info.set_log( ("Rolling window complete for %s in {duration:0.2f} ms. " "Input: %s rows from %s to %s. Output: %s rows from %s to %s"), - message.user_id, - len(message.df), - message.df[self._config.ae.timestamp_column_name].min(), - message.df[self._config.ae.timestamp_column_name].max(), + message.get_metadata('user_id'), + len(message.payload().df), + message.payload().df[self._config.ae.timestamp_column_name].min(), + message.payload().df[self._config.ae.timestamp_column_name].max(), result.payload().count, result.payload().get_data(self._config.ae.timestamp_column_name).min(), result.payload().get_data(self._config.ae.timestamp_column_name).max(), diff --git a/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py b/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py index 2a40b4521e..e88b3dfc49 100644 --- a/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py +++ b/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py @@ -14,7 +14,6 @@ """Split messages into individual users and generic messages.""" import logging -import typing import mrc import numpy as np @@ -24,10 +23,11 @@ import cudf from morpheus.config import Config +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.type_aliases import DataFrameType -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.logging_timer import log_time logger = logging.getLogger(f"morpheus.{__name__}") @@ -59,8 +59,8 @@ def __init__(self, c: Config, include_generic: bool, include_individual: bool, - skip_users: typing.List[str] = None, - only_users: typing.List[str] = None): + skip_users: list[str] = None, + only_users: list[str] = None): super().__init__(c) self._include_generic = include_generic @@ -69,25 +69,25 @@ def __init__(self, self._only_users = only_users if only_users is not None else [] # Map of user ids to total number of messages. Keeps indexes monotonic and increasing per user - self._user_index_map: typing.Dict[str, int] = {} + self._user_index_map: dict[str, int] = {} @property def name(self) -> str: """Stage name.""" return "dfp-split-users" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: """Whether this stage supports a C++ node.""" return False - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """Input types accepted by this stage.""" return (cudf.DataFrame, pd.DataFrame) def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(DFPMessageMeta) + schema.output_schema.set_type(ControlMessage) - def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: + def extract_users(self, message: DataFrameType) -> list[ControlMessage]: """ Extract users from a message, splitting the incoming data into unique messages on a per-user basis, and potentially filtering data based on the user. 
@@ -101,7 +101,7 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: # Convert to pandas because cudf is slow at this message = message.to_pandas() - split_dataframes: typing.Dict[str, pd.DataFrame] = {} + split_dataframes: dict[str, pd.DataFrame] = {} # If we are skipping users, do that here if (len(self._skip_users) > 0): @@ -123,7 +123,8 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: user_df in message.groupby(self._config.ae.userid_column_name, sort=False) }) - output_messages: typing.List[DFPMessageMeta] = [] + output_messages: list[ControlMessage] = [] + rows_per_user: list[int] = [] for user_id in sorted(split_dataframes.keys()): @@ -138,7 +139,12 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: user_df.index = range(current_user_count, current_user_count + len(user_df)) self._user_index_map[user_id] = current_user_count + len(user_df) - output_messages.append(DFPMessageMeta(df=user_df, user_id=user_id)) + rows_per_user.append(len(user_df)) + meta = MessageMeta(cudf.DataFrame.from_pandas(user_df)) + cm_msg = ControlMessage() + cm_msg.payload(meta) + cm_msg.set_metadata("user_id", user_id) + output_messages.append(cm_msg) # logger.debug("Emitting dataframe for user '%s'. Start: %s, End: %s, Count: %s", # user, @@ -146,8 +152,6 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: # df_user[self._config.ae.timestamp_column_name].max(), # df_user[self._config.ae.timestamp_column_name].count()) - rows_per_user = [len(x.df) for x in output_messages] - if (len(output_messages) > 0): log_info.set_log( ("Batch split users complete. Input: %s rows from %s to %s. " diff --git a/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py b/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py index 9e3e2d904c..036e2c90eb 100644 --- a/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py +++ b/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py @@ -17,7 +17,6 @@ from morpheus.cli.utils import get_package_relative_file from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.messages import ControlMessage from morpheus.utils.file_utils import load_labels_file from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE @@ -172,12 +171,8 @@ def generate_ae_config(source: str, timestamp_column_name: str, pipeline_batch_size: int = 0, edge_buffer_size: int = 0, - use_cpp: bool = False, num_threads: int = len(os.sched_getaffinity(0))): config = Config() - - CppConfig.set_should_use_cpp(use_cpp) - config.num_threads = num_threads if pipeline_batch_size > 0: diff --git a/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp b/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp index 10ead7fa5d..51323b8ef6 100644 --- a/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp +++ b/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp @@ -33,7 +33,6 @@ #include "morpheus/messages/control.hpp" // IWYU pragma: keep #include "morpheus/pybind11/json.hpp" // IWYU pragma: keep -#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/json_types.hpp" #include "morpheus/version.hpp" @@ -70,9 +69,6 @@ PYBIND11_MODULE(llm, _module) )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - // Import the mrc coro module mrc::pymrc::import(_module, "mrc.core.coro"); diff --git a/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py 
b/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py index 3c4e07c9ec..c9528f3c78 100644 --- a/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py +++ b/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py @@ -21,13 +21,13 @@ from mrc.core import operators as ops from pydantic import ValidationError -import cudf - from morpheus.messages import ControlMessage from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import WRITE_TO_VECTOR_DB from morpheus.utils.module_utils import ModuleLoaderFactory from morpheus.utils.module_utils import register_module +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj from morpheus_llm.modules.schemas.write_to_vector_db_schema import WriteToVDBSchema from morpheus_llm.service.vdb.milvus_client import DATA_TYPE_MAP from morpheus_llm.service.vdb.utils import VectorDBServiceFactory @@ -70,7 +70,7 @@ def preprocess_vdb_resources(service, recreate: bool, resource_schemas: dict): class AccumulationStats: msg_count: int last_insert_time: float - data: list[cudf.DataFrame] + data: list[DataFrameType] @register_module(WRITE_TO_VECTOR_DB, MORPHEUS_MODULE_NAMESPACE) @@ -144,12 +144,12 @@ def on_completed(): for key, accum_stats in accumulator_dict.items(): try: if accum_stats.data: - merged_df = cudf.concat(accum_stats.data) + df_pkg = get_df_pkg_from_obj(accum_stats.data[0]) + merged_df = df_pkg.concat(accum_stats.data) service.insert_dataframe(name=key, df=merged_df) final_df_references.append(accum_stats.data) except Exception as e: logger.error("Unable to upload dataframe entries to vector database: %s", e) - raise finally: # Close vector database service connection service.close() @@ -175,9 +175,6 @@ def on_data(msg: ControlMessage): df, msg_resource_target = extract_df(msg) if df is not None and not df.empty: - if (not isinstance(df, cudf.DataFrame)): - df = cudf.DataFrame(df) - df_size = len(df) current_time = time.time() @@ -202,7 +199,8 @@ def on_data(msg: ControlMessage): (current_time - accum_stats.last_insert_time) >= write_time_interval): if accum_stats.data: - merged_df = cudf.concat(accum_stats.data) + df_pkg = get_df_pkg_from_obj(accum_stats.data[0]) + merged_df = df_pkg.concat(accum_stats.data) # pylint: disable=not-a-mapping service.insert_dataframe(name=key, df=merged_df, **resource_kwargs) diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py index 0197f3071d..8a31ed8085 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py @@ -17,10 +17,7 @@ import time import typing -import pandas as pd - -import cudf - +from morpheus.utils.type_aliases import DataFrameType from morpheus_llm.error import IMPORT_ERROR_MESSAGE from morpheus_llm.service.vdb.vector_db_service import VectorDBResourceService from morpheus_llm.service.vdb.vector_db_service import VectorDBService @@ -81,13 +78,13 @@ def insert(self, data: list[list] | list[dict], **kwargs) -> dict: """ raise NotImplementedError("Insert operation is not supported in FAISS") - def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs) -> dict: + def insert_dataframe(self, df: DataFrameType, **kwargs) -> dict: """ Insert a dataframe entires into the vector database. 
Parameters ---------- - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted into the collection. **kwargs Extra keyword arguments specific to the vector database implementation. @@ -368,11 +365,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs): """ raise NotImplementedError("create operation is not supported in FAISS") - def create_from_dataframe(self, - name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], - overwrite: bool = False, - **kwargs) -> None: + def create_from_dataframe(self, name: str, df: DataFrameType, overwrite: bool = False, **kwargs) -> None: """ Create collections in the vector database. @@ -380,7 +373,7 @@ def create_from_dataframe(self, ---------- name : str Name of the collection. - df : Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType The dataframe to create the collection from. overwrite : bool, optional Whether to overwrite the collection if it already exists. Default is False. @@ -416,8 +409,7 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs) -> dict[str raise NotImplementedError("create_from_dataframe operation is not supported in FAISS") - def insert_dataframe(self, name: str, df: typing.Union[cudf.DataFrame, pd.DataFrame], - **kwargs) -> dict[str, typing.Any]: + def insert_dataframe(self, name: str, df: DataFrameType, **kwargs) -> dict[str, typing.Any]: """ Converts dataframe to rows and insert to the vector database. @@ -425,7 +417,7 @@ def insert_dataframe(self, name: str, df: typing.Union[cudf.DataFrame, pd.DataFr ---------- name : str Name of the collection to be inserted. - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted in the collection. **kwargs Additional keyword arguments containing collection configuration. 
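
The same `DataFrameType` substitution repeats through the vector DB services below, paired with `is_cudf_type` so that `cudf` is only touched when a GPU frame is actually present. A short sketch of the pattern in isolation (the `to_rows` helper is illustrative, not part of the patch):

```python
import pandas as pd

from morpheus.utils.type_aliases import DataFrameType
from morpheus.utils.type_utils import is_cudf_type


def to_rows(df: DataFrameType) -> list[dict]:
    # Only cuDF frames need conversion; pandas frames pass straight through,
    # so this code path never imports cudf on a CPU-only system.
    if is_cudf_type(df):
        df = df.to_pandas()
    return df.to_dict(orient="records")


print(to_rows(pd.DataFrame({"a": [1, 2]})))  # [{'a': 1}, {'a': 2}]
```
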
diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py index 71df614b23..5c3f020aea 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py @@ -20,11 +20,10 @@ import typing from functools import wraps -import cudf - from morpheus.io.utils import cudf_string_cols_exceed_max_bytes from morpheus.io.utils import truncate_string_cols_by_bytes from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type from morpheus_llm.error import IMPORT_ERROR_MESSAGE from morpheus_llm.service.vdb.vector_db_service import VectorDBResourceService from morpheus_llm.service.vdb.vector_db_service import VectorDBService @@ -327,7 +326,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) - logger.info("Skipped checking 'None' in the field: %s, with datatype: %s", field_name, dtype) needs_truncate = self._truncate_long_strings - if needs_truncate and isinstance(df, cudf.DataFrame): + if needs_truncate and is_cudf_type(df): # Cudf specific optimization, we can avoid a costly call to truncate_string_cols_by_bytes if all of the # string columns are already below the max length needs_truncate = cudf_string_cols_exceed_max_bytes(df, self._fields_max_length) @@ -336,7 +335,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) - column_names = [field.name for field in self._fields if not field.auto_id] collection_df = df[column_names] - if isinstance(collection_df, cudf.DataFrame): + if is_cudf_type(collection_df): collection_df = collection_df.to_pandas() if needs_truncate: @@ -728,7 +727,7 @@ def _build_schema_conf(self, df: DataFrameType) -> list[dict]: # Always add a primary key fields.append({"name": "pk", "dtype": pymilvus.DataType.INT64, "is_primary": True, "auto_id": True}) - if isinstance(df, cudf.DataFrame): + if is_cudf_type(df): df = df.to_pandas() # Loop over all of the columns of the first row and build the schema diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py index 8f2d346f55..bbf0439028 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py @@ -17,9 +17,7 @@ from abc import ABC from abc import abstractmethod -import pandas as pd - -import cudf +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -50,13 +48,13 @@ def insert(self, data: list[list] | list[dict], **kwargs: dict[str, typing.Any]) pass @abstractmethod - def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs: dict[str, typing.Any]) -> dict: + def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict: """ Insert a dataframe into the vector database. Parameters ---------- - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted into the resource. **kwargs : dict[str, typing.Any] Extra keyword arguments specific to the vector database implementation. 
@@ -241,10 +239,7 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs: dict[str, t pass @abstractmethod - def insert_dataframe(self, - name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], - **kwargs: dict[str, typing.Any]) -> dict: + def insert_dataframe(self, name: str, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict: """ Converts dataframe to rows and insert into the vector database resource. @@ -252,7 +247,7 @@ def insert_dataframe(self, ---------- name : str Name of the resource to be inserted. - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted. **kwargs : dict[str, typing.Any] Additional keyword arguments containing collection configuration. @@ -391,7 +386,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs: dict[str, typing. @abstractmethod def create_from_dataframe(self, name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], + df: DataFrameType, overwrite: bool = False, **kwargs: dict[str, typing.Any]) -> None: """ @@ -401,7 +396,7 @@ def create_from_dataframe(self, ---------- name : str Name of the resource. - df : Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType The dataframe to create the resource from. overwrite : bool, optional Whether to overwrite the resource if it already exists. Default is False. diff --git a/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py b/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py index 86c0717964..f16442ac59 100644 --- a/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py +++ b/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py @@ -12,13 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import logging +import types import typing import mrc +from mrc.core import operators as ops -import morpheus_llm._lib.llm as _llm from morpheus.config import Config +from morpheus.config import CppConfig from morpheus.messages import ControlMessage from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -65,11 +68,28 @@ def supports_cpp_node(self): """Indicates whether this stage supports a C++ node.""" return True - def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + def _cast_control_message(self, message: ControlMessage, *, cpp_messages_lib: types.ModuleType) -> ControlMessage: + """ + LLMEngineStage does not contain a Python implementation, however it is capable of running in Python/cpu-only + mode. This method is needed to cast the Python ControlMessage to a C++ ControlMessage. + + This is different than casting from the Python bindings for the C++ ControlMessage to a C++ ControlMessage. 
+ """ + return cpp_messages_lib.ControlMessage(message) + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + import morpheus_llm._lib.llm as _llm node = _llm.LLMEngineStage(builder, self.unique_name, self._engine) node.launch_options.pe_count = 1 + if not CppConfig.get_should_use_cpp(): + import morpheus._lib.messages as _messages + cast_fn = functools.partial(self._cast_control_message, cpp_messages_lib=_messages) + pre_node = builder.make_node(f"{self.unique_name}-pre-cast", ops.map(cast_fn)) + builder.make_edge(input_node, pre_node) + + input_node = pre_node + builder.make_edge(input_node, node) return node diff --git a/tests/_utils/dataset_manager.py b/tests/_utils/dataset_manager.py index c6aeb09892..40202b4025 100644 --- a/tests/_utils/dataset_manager.py +++ b/tests/_utils/dataset_manager.py @@ -29,7 +29,9 @@ from _utils import assert_results from morpheus.io.deserializers import read_file_to_df from morpheus.utils import compare_df +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType class DatasetManager: @@ -38,19 +40,19 @@ class DatasetManager: Parameters ---------- - df_type : typing.Literal['cudf', 'pandas'] + df_type : DataFrameTypeStr Type of DataFrame to return unless otherwise explicitly specified. """ - __df_cache: typing.Dict[typing.Tuple[typing.Literal['cudf', 'pandas'], str], DataFrameType] = {} + __df_cache: dict[tuple[DataFrameModule, str], DataFrameType] = {} # Values in `__instances` are instances of `DatasetLoader` - __instances: typing.Dict[typing.Literal['cudf', 'pandas'], typing.Any] = {} + __instances: dict[DataFrameModule, "DatasetManager"] = {} # Explicitly using __new__ instead of of an __init__ to implement this as a singleton for each dataframe type. # Initialization is also being performed here instead of an __init__ method as an __init__ method would be re-run # the __init__ on the singleton instance for each cache hit. 
-    def __new__(cls, df_type: typing.Literal['cudf', 'pandas']):
+    def __new__(cls, df_type: DataFrameModule):
         """Returns the singleton instance of `DatasetManager` for the specified `df_type`."""
         try:
             return cls.__instances[df_type]
@@ -61,7 +63,7 @@ def __new__(cls, df_type: typing.Literal['cudf', 'pandas']):
         return instance
 
     @staticmethod
-    def get_alt_df_type(df_type: typing.Literal['cudf', 'pandas']) -> typing.Literal['cudf', 'pandas']:
+    def get_alt_df_type(df_type: DataFrameModule) -> DataFrameModule:
         """Returns the other possible df type."""
         return 'cudf' if df_type == 'pandas' else 'pandas'
 
@@ -71,7 +73,7 @@ def clear(self):
 
     def get_df(self,
                file_path: str,
-               df_type: typing.Literal['cudf', 'pandas'] = None,
+               df_type: DataFrameModule = None,
                no_cache: bool = False,
                **reader_kwargs) -> DataFrameType:
         """
@@ -123,9 +125,7 @@ def get_df(self,
 
         return df.copy(deep=True)
 
-    def __getitem__(
-            self, item: typing.Union[str, typing.Tuple[str], typing.Tuple[str, typing.Literal['cudf',
-                                                                                              'pandas']]]) -> DataFrameType:
+    def __getitem__(self, item: str | tuple[str] | tuple[str, DataFrameModule]) -> DataFrameType:
         """Implements `__getitem__` to allow for fetching DataFrames using the `[]` operator."""
         if not isinstance(item, tuple):
             item = (item, )
@@ -172,7 +172,7 @@ def repeat(df: DataFrameType, repeat_count: int = 2, reset_index: bool = True) -
         return repeated_df
 
     @staticmethod
-    def replace_index(df: DataFrameType, replace_ids: typing.Dict[int, int]) -> DataFrameType:
+    def replace_index(df: DataFrameType, replace_ids: dict[int, int]) -> DataFrameType:
         """Return a new DataFrame where we replace some index values with others."""
         return df.rename(index=replace_ids)
 
@@ -192,7 +192,7 @@ def dup_index(cls, df: DataFrameType, count: int = 1) -> DataFrameType:
         return cls.replace_index(df, replace_dict)
 
     @staticmethod
-    def _value_as_pandas(val: typing.Union[pd.DataFrame, cdf.DataFrame, cdf.Series], assert_is_pandas=True):
+    def _value_as_pandas(val: DataFrameType | SeriesType, assert_is_pandas=True):
         if (isinstance(val, (cdf.DataFrame, cdf.Series))):
             return val.to_pandas()
 
@@ -202,7 +202,15 @@ def _value_as_pandas(val: typing.Union[pd.DataFrame, cdf.DataFrame, cdf.Series],
         return val
 
     @classmethod
-    def df_equal(cls, df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], val_to_check: typing.Any):
+    def _value_as_pandas_df(cls, val: DataFrameType | SeriesType, assert_is_pandas=True):
+        pval = cls._value_as_pandas(val, assert_is_pandas=assert_is_pandas)
+        if isinstance(pval, pd.Series):
+            pval = pval.to_frame()
+
+        return pval
+
+    @classmethod
+    def df_equal(cls, df_to_check: DataFrameType, val_to_check: typing.Any):
         """
         Compare a DataFrame against a validation dataset which can either be a DataFrame, Series or CuPy array.
         Returns True if they are equal.
@@ -224,7 +232,7 @@ def df_equal(cls, df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], val_to @classmethod def assert_df_equal(cls, - df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], + df_to_check: DataFrameType, val_to_check: typing.Any, assert_msg="Dataframes are not equal."): """ @@ -234,20 +242,14 @@ def assert_df_equal(cls, assert cls.df_equal(df_to_check=df_to_check, val_to_check=val_to_check), assert_msg @classmethod - def compare_df(cls, - dfa: typing.Union[pd.DataFrame, cdf.DataFrame], - dfb: typing.Union[pd.DataFrame, cdf.DataFrame], - **compare_args): + def compare_df(cls, dfa: DataFrameType, dfb: DataFrameType, **compare_args): """Wrapper for `morpheus.utils.compare_df.compare_df`.""" with warnings.catch_warnings(): # Ignore performance warnings from pandas triggered by the comparison warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) - return compare_df.compare_df(cls._value_as_pandas(dfa), cls._value_as_pandas(dfb), **compare_args) + return compare_df.compare_df(cls._value_as_pandas_df(dfa), cls._value_as_pandas_df(dfb), **compare_args) @classmethod - def assert_compare_df(cls, - dfa: typing.Union[pd.DataFrame, cdf.DataFrame], - dfb: typing.Union[pd.DataFrame, cdf.DataFrame], - **compare_args): + def assert_compare_df(cls, dfa: DataFrameType, dfb: DataFrameType, **compare_args): """Convenience method for calling `compare_df` and asserting that the results are equivalent.""" assert_results(cls.compare_df(dfa, dfb, **compare_args)) diff --git a/tests/_utils/inference_worker.py b/tests/_utils/inference_worker.py index 7470e474d5..29af5a0c02 100644 --- a/tests/_utils/inference_worker.py +++ b/tests/_utils/inference_worker.py @@ -26,8 +26,6 @@ class IW(inference_stage.InferenceWorker): """ def calc_output_dims(self, _): - # Intentionally calling the abc empty method for coverage - super().calc_output_dims(_) return (1, 2) def process(self, _: ControlMessage, __: typing.Callable[[TensorMemory], None]): diff --git a/tests/_utils/stages/check_pre_alloc.py b/tests/_utils/stages/check_pre_alloc.py index 0f871a78dd..c8217cc28b 100644 --- a/tests/_utils/stages/check_pre_alloc.py +++ b/tests/_utils/stages/check_pre_alloc.py @@ -21,11 +21,12 @@ from morpheus.common import typeid_to_numpy_str from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class CheckPreAlloc(PassThruTypeMixin, SinglePortStage): +class CheckPreAlloc(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Acts like add-class/add-scores in that it requests a preallocation, the node will assert that the preallocation occurred with the correct type. 
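The test stages in the hunks that follow each gain `GpuAndCpuMixin`. Since stages declare their modes through `supported_execution_modes`, the mixins plausibly reduce to simple overrides along these lines (a guess at the shape of `morpheus.pipeline.execution_mode_mixins`, not its actual contents):

```python
# Guessed shape of the execution-mode mixins; the real module may differ.
from morpheus.config import ExecutionMode


class GpuAndCpuMixin:
    """Declares that a stage can run in either execution mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode, ...]:
        return (ExecutionMode.GPU, ExecutionMode.CPU)


class CpuOnlyMixin:
    """Declares that a stage can only run in CPU-only mode."""

    def supported_execution_modes(self) -> tuple[ExecutionMode, ...]:
        return (ExecutionMode.CPU, )
```

Applying a mixin at a glance documents which modes a stage supports, instead of each stage overriding the method by hand.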
@@ -38,16 +39,16 @@ def __init__(self, c, probs_type): self._needed_columns.update({label: probs_type for label in c.class_labels}) @property - def name(self): + def name(self) -> str: return "check-prealloc" - def accepted_types(self): + def accepted_types(self) -> tuple: return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def _check_prealloc(self, msg: ControlMessage): + def _check_prealloc(self, msg: ControlMessage) -> ControlMessage: df = msg.payload().df for label in self._class_labels: assert label in df.columns diff --git a/tests/_utils/stages/control_message_pass_thru.py b/tests/_utils/stages/control_message_pass_thru.py index 659606d38c..cd3ba74e18 100644 --- a/tests/_utils/stages/control_message_pass_thru.py +++ b/tests/_utils/stages/control_message_pass_thru.py @@ -18,23 +18,24 @@ from mrc.core import operators as ops from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class ControlMessagePassThruStage(PassThruTypeMixin, SinglePortStage): +class ControlMessagePassThruStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): @property def name(self) -> str: return "mm-pass-thru" - def accepted_types(self): + def accepted_types(self) -> tuple: return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def on_data(self, message: ControlMessage): + def on_data(self, message: ControlMessage) -> ControlMessage: return message def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: diff --git a/tests/_utils/stages/conv_msg.py b/tests/_utils/stages/conv_msg.py index edd64f5384..637963e50a 100755 --- a/tests/_utils/stages/conv_msg.py +++ b/tests/_utils/stages/conv_msg.py @@ -15,23 +15,24 @@ import typing -import cupy as cp import mrc -import pandas as pd from mrc.core import operators as ops -import cudf - -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_array_pkg +from morpheus.utils.type_utils import get_df_pkg +from morpheus.utils.type_utils import get_df_pkg_from_obj @register_stage("unittest-conv-msg", ignore_args=["expected_data"]) -class ConvMsg(SinglePortStage): +class ConvMsg(GpuAndCpuMixin, SinglePortStage): """ Simple test stage to convert a ControlMessage to a ControlMessage with probs tensor. Basically a cheap replacement for running an inference stage. 
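`ConvMsg` below picks its dataframe and tensor packages off `c.execution_mode` via `get_df_pkg` and `get_array_pkg`. Their assumed behavior, sketched under the convention that GPU mode maps to cuDF/CuPy and CPU mode to pandas/NumPy (the actual implementations in `morpheus.utils.type_utils` may differ):

```python
import types

from morpheus.config import ExecutionMode


def get_array_pkg(mode: ExecutionMode) -> types.ModuleType:
    """CuPy in GPU mode, NumPy in CPU-only mode; imports stay lazy."""
    if mode == ExecutionMode.GPU:
        import cupy
        return cupy
    import numpy
    return numpy


def get_df_pkg(mode: ExecutionMode) -> types.ModuleType:
    """cuDF in GPU mode, pandas in CPU-only mode; imports stay lazy."""
    if mode == ExecutionMode.GPU:
        import cudf
        return cudf
    import pandas
    return pandas
```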
@@ -45,17 +46,20 @@ class ConvMsg(SinglePortStage): def __init__(self, c: Config, - expected_data: typing.Union[pd.DataFrame, cudf.DataFrame] = None, + expected_data: DataFrameType = None, columns: typing.List[str] = None, order: str = 'K', probs_type: str = 'f4', empty_probs: bool = False): super().__init__(c) + self._df_pkg = get_df_pkg(c.execution_mode) + self._array_pkg = get_array_pkg(c.execution_mode) + if expected_data is not None: - assert isinstance(expected_data, (pd.DataFrame, cudf.DataFrame)) + assert isinstance(expected_data, self._df_pkg.DataFrame) - self._expected_data = expected_data + self._expected_data: DataFrameType | None = expected_data self._columns = columns self._order = order self._probs_type = probs_type @@ -76,20 +80,21 @@ def supports_cpp_node(self) -> bool: def _conv_message(self, message: ControlMessage) -> ControlMessage: if self._expected_data is not None: - if (isinstance(self._expected_data, cudf.DataFrame)): + df_pkg = get_df_pkg_from_obj(self._expected_data) + if (isinstance(self._expected_data, self._df_pkg.DataFrame)): df = self._expected_data.copy(deep=True) else: - df = cudf.DataFrame(self._expected_data) + df = df_pkg.DataFrame(self._expected_data) else: - df: cudf.DataFrame = message.payload().get_data(self._columns) # type: ignore + df: DataFrameType = message.payload().get_data(self._columns) # type: ignore if self._empty_probs: - probs = cp.zeros([len(df), 3], 'float') + probs = self._array_pkg.zeros([len(df), 3], 'float') else: - probs = cp.array(df.values, dtype=self._probs_type, copy=True, order=self._order) + probs = self._array_pkg.array(df.values, dtype=self._probs_type, copy=True, order=self._order) - message.tensors(_messages.TensorMemory(count=len(probs), tensors={'probs': probs})) + message.tensors(TensorMemory(count=len(probs), tensors={'probs': probs})) return message def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: diff --git a/tests/_utils/stages/dfp_length_checker.py b/tests/_utils/stages/dfp_length_checker.py index 1162a647f7..659c8c81ec 100755 --- a/tests/_utils/stages/dfp_length_checker.py +++ b/tests/_utils/stages/dfp_length_checker.py @@ -21,13 +21,14 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.atomic_integer import AtomicInteger @register_stage("unittest-dfp-length-check") -class DFPLengthChecker(PassThruTypeMixin, SinglePortStage): +class DFPLengthChecker(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Verifies that the incoming MessageMeta classes are of a specific length diff --git a/tests/_utils/stages/error_raiser.py b/tests/_utils/stages/error_raiser.py index 8923229ab2..f3e0d8b5e6 100644 --- a/tests/_utils/stages/error_raiser.py +++ b/tests/_utils/stages/error_raiser.py @@ -19,12 +19,13 @@ from mrc.core import operators as ops from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.atomic_integer import AtomicInteger -class ErrorRaiserStage(PassThruTypeMixin, SinglePortStage): +class ErrorRaiserStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ 
Stage that raises an exception in the on_data method
    """
diff --git a/tests/_utils/stages/in_memory_multi_source_stage.py b/tests/_utils/stages/in_memory_multi_source_stage.py
index 1eb2d46092..7497aff7ed 100644
--- a/tests/_utils/stages/in_memory_multi_source_stage.py
+++ b/tests/_utils/stages/in_memory_multi_source_stage.py
@@ -18,11 +18,12 @@
 import mrc
 
 from morpheus.config import Config
+from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
 from morpheus.pipeline.source_stage import SourceStage
 from morpheus.pipeline.stage_schema import StageSchema
 
 
-class InMemoryMultiSourceStage(SourceStage):
+class InMemoryMultiSourceStage(GpuAndCpuMixin, SourceStage):
     """
     In memory multi-source stage for testing purposes, accepts a 2d array `data`. The first dimension represents
     the number of output ports, and the second represents the data for each port, and
diff --git a/tests/_utils/stages/in_memory_source_x_stage.py b/tests/_utils/stages/in_memory_source_x_stage.py
index bd1256c07a..229e8e1a3a 100644
--- a/tests/_utils/stages/in_memory_source_x_stage.py
+++ b/tests/_utils/stages/in_memory_source_x_stage.py
@@ -18,11 +18,12 @@
 import mrc
 
 from morpheus.config import Config
+from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
 from morpheus.pipeline.single_output_source import SingleOutputSource
 from morpheus.pipeline.stage_schema import StageSchema
 
 
-class InMemSourceXStage(SingleOutputSource):
+class InMemSourceXStage(GpuAndCpuMixin, SingleOutputSource):
     """
     InMemorySourceStage subclass that emits whatever you give it and doesn't assume the source data
     is a dataframe.
diff --git a/tests/_utils/stages/multi_port_pass_thru.py b/tests/_utils/stages/multi_port_pass_thru.py
index 5454974870..5cffb47b2b 100644
--- a/tests/_utils/stages/multi_port_pass_thru.py
+++ b/tests/_utils/stages/multi_port_pass_thru.py
@@ -20,11 +20,12 @@
 import mrc.core.operators as ops
 
 from morpheus.config import Config
+from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
 from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin
 from morpheus.pipeline.stage import Stage
 
 
-class MultiPortPassThruStage(PassThruTypeMixin, Stage):
+class MultiPortPassThruStage(GpuAndCpuMixin, PassThruTypeMixin, Stage):
 
     def __init__(self, c: Config, num_ports: int):
         super().__init__(c)
diff --git a/tests/_utils/stages/record_thread_id_stage.py b/tests/_utils/stages/record_thread_id_stage.py
index d2d9a12a82..0a991c1706 100644
--- a/tests/_utils/stages/record_thread_id_stage.py
+++ b/tests/_utils/stages/record_thread_id_stage.py
@@ -19,11 +19,12 @@
 import mrc
 
 from morpheus.config import Config
+from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
 from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin
 from morpheus.pipeline.single_port_stage import SinglePortStage
 
 
-class RecordThreadIdStage(PassThruTypeMixin, SinglePortStage):
+class RecordThreadIdStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage):
     """
     Forwarding stage that records the thread id of the progress engine
     """
diff --git a/tests/_utils/stages/split_stage.py b/tests/_utils/stages/split_stage.py
index 4e816de6c0..c03db636fa 100644
--- a/tests/_utils/stages/split_stage.py
+++ b/tests/_utils/stages/split_stage.py
@@ -20,11 +20,12 @@
 from morpheus.config import Config
 from morpheus.messages import MessageMeta
+from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin
 from morpheus.pipeline.stage import Stage
 from morpheus.pipeline.stage_schema import StageSchema
 
 
-class SplitStage(Stage):
+class 
SplitStage(GpuAndCpuMixin, Stage): def __init__(self, c: Config): super().__init__(c) diff --git a/tests/benchmarks/test_bench_agents_simple_pipeline.py b/tests/benchmarks/test_bench_agents_simple_pipeline.py index cbd83e3cae..ffad11dc78 100644 --- a/tests/benchmarks/test_bench_agents_simple_pipeline.py +++ b/tests/benchmarks/test_bench_agents_simple_pipeline.py @@ -97,7 +97,7 @@ def _run_pipeline(config: Config, source_dfs: list[cudf.DataFrame], model_name: @pytest.mark.usefixtures("openai", "restore_environ") -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.benchmark @mock.patch("langchain.utilities.serpapi.SerpAPIWrapper.aresults") @mock.patch("langchain.OpenAI._agenerate", autospec=True) # autospec is needed as langchain will inspect the function diff --git a/tests/benchmarks/test_bench_completion_pipeline.py b/tests/benchmarks/test_bench_completion_pipeline.py index 20f921d228..c45f3ecd9c 100644 --- a/tests/benchmarks/test_bench_completion_pipeline.py +++ b/tests/benchmarks/test_bench_completion_pipeline.py @@ -74,7 +74,7 @@ def _run_pipeline(config: Config, @pytest.mark.use_cudf -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.benchmark @pytest.mark.usefixtures("mock_nemollm", "mock_chat_completion") @pytest.mark.parametrize("llm_service_cls", [NeMoLLMService, OpenAIChatService]) diff --git a/tests/benchmarks/test_bench_rag_standalone_pipeline.py b/tests/benchmarks/test_bench_rag_standalone_pipeline.py index 8f531326a8..e394eaa331 100644 --- a/tests/benchmarks/test_bench_rag_standalone_pipeline.py +++ b/tests/benchmarks/test_bench_rag_standalone_pipeline.py @@ -121,7 +121,7 @@ def _run_pipeline(config: Config, @pytest.mark.milvus -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.use_cudf @pytest.mark.benchmark @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/common/utils.py')) diff --git a/tests/benchmarks/test_bench_vdb_upload_pipeline.py b/tests/benchmarks/test_bench_vdb_upload_pipeline.py index f7864fb779..51ae9842a1 100644 --- a/tests/benchmarks/test_bench_vdb_upload_pipeline.py +++ b/tests/benchmarks/test_bench_vdb_upload_pipeline.py @@ -87,7 +87,7 @@ def _run_pipeline(config: Config, @pytest.mark.milvus -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.use_pandas @pytest.mark.benchmark @pytest.mark.import_mod([ diff --git a/tests/conftest.py b/tests/conftest.py index 55c3b03605..3dca6bc243 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,9 @@ from _utils.kafka import kafka_consumer_fixture # noqa: F401 pylint:disable=unused-import from _utils.kafka import kafka_topics_fixture # noqa: F401 pylint:disable=unused-import +if typing.TYPE_CHECKING: + from morpheus.config import ExecutionMode + # Don't let pylint complain about pytest fixtures # pylint: disable=redefined-outer-name,unused-argument @@ -108,32 +111,11 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): supports """ - # === use_cpp Parameterize === - use_cpp = metafunc.definition.get_closest_marker("use_cpp") is not None - use_python = metafunc.definition.get_closest_marker("use_python") is not None - - use_cpp_param = pytest.param(True, marks=pytest.mark.use_cpp(added_by="generate_tests"), id="use_cpp") - use_python_param = pytest.param(False, marks=pytest.mark.use_python(added_by="generate_tests"), id="use_python") - - _set_use_cpp_params = [] - - if ("use_cpp" in metafunc.fixturenames): - # Need to add some params since the fixture was requested - - # Add cpp unless use_cpp == True and use_python == False - if not (use_python and 
not use_cpp): - _set_use_cpp_params.append(use_cpp_param) - - # Add python unless use_cpp == False and use_python == True - if not (not use_python and use_cpp): - _set_use_cpp_params.append(use_python_param) - - elif (use_cpp and use_python): - # Need to parameterize since we have multiple - _set_use_cpp_params.extend([use_cpp_param, use_python_param]) - - if (len(_set_use_cpp_params) > 0): - metafunc.parametrize("_set_use_cpp", _set_use_cpp_params, indirect=True) + # A test can request a fixture by placing it in the function arguments, or with a mark + if ("gpu_and_cpu_mode" in metafunc.fixturenames or metafunc.definition.get_closest_marker("gpu_and_cpu_mode")): + gpu_mode_param = pytest.param(True, marks=pytest.mark.gpu_mode(added_by="generate_tests"), id="gpu_mode") + cpu_mode_param = pytest.param(False, marks=pytest.mark.cpu_mode(added_by="generate_tests"), id="cpu_mode") + metafunc.parametrize("execution_mode", [gpu_mode_param, cpu_mode_param], indirect=True) # === df_type Parameterize === if ("df_type" in metafunc.fixturenames): @@ -172,24 +154,23 @@ def pytest_runtest_setup(item): def pytest_collection_modifyitems(session: pytest.Session, config: pytest.Config, items: typing.List[pytest.Item]): """ - To support old unittest style tests, try to determine the mark from the name + Remove tests that are incompatible with the current configuration. """ if config.getoption("--run_kafka") and not PYTEST_KAFKA_AVAIL: raise RuntimeError(f"--run_kafka requested but pytest_kafka not available due to: {PYTEST_KAFKA_ERROR}") - for item in items: - if "no_cpp" in item.nodeid and item.get_closest_marker("use_python") is None: - item.add_marker(pytest.mark.use_python(added_in="collection_modifyitems")) - elif "cpp" in item.nodeid and item.get_closest_marker("use_cpp") is None: - item.add_marker(pytest.mark.use_cpp(added_in="collection_modifyitems")) - def should_filter_test(item: pytest.Item): - use_cpp = item.get_closest_marker("use_cpp") + gpu_mode = item.get_closest_marker("gpu_mode") use_pandas = item.get_closest_marker("use_pandas") + use_cudf = item.get_closest_marker("use_cudf") + cpu_mode = item.get_closest_marker("cpu_mode") + + if (gpu_mode and use_pandas): + return False - if (use_cpp and use_pandas): + if (use_cudf and cpu_mode): return False return True @@ -212,113 +193,96 @@ def pytest_runtest_teardown(item, nextitem): reset_logging(logger_name=None) # Reset the root logger as well -# This fixture will be used by all tests. -@pytest.fixture(scope="function", autouse=True) -def _set_use_cpp(request: pytest.FixtureRequest): +@pytest.fixture(scope="function") +def df_type(request: pytest.FixtureRequest): - do_use_cpp: bool = True + df_type_str: typing.Literal["cudf", "pandas"] # Check for the param if this was indirectly set - if (hasattr(request, "param") and isinstance(request.param, bool)): - do_use_cpp = request.param + if (hasattr(request, "param")): + assert request.param in ["pandas", "cudf"], "Invalid parameter for df_type" + + df_type_str = request.param else: # If not, check for the marker and use that - use_cpp = request.node.get_closest_marker("use_cpp") is not None - use_python = request.node.get_closest_marker("use_python") is not None + use_pandas = request.node.get_closest_marker("use_pandas") is not None + use_cudf = request.node.get_closest_marker("use_cudf") is not None - if (use_cpp and use_python): - raise RuntimeError(f"Both markers (use_cpp and use_python) were added to function {request.node.nodeid}. 
" + if (use_pandas and use_cudf): + raise RuntimeError(f"Both markers (use_pandas and use_cudf) were added to function {request.node.nodeid}. " "Remove markers to support both.") - # This will default to True or follow use_cpp - do_use_cpp = not use_python - - from morpheus.config import CppConfig + # This will default to "cudf" or follow use_pandas + df_type_str = "cudf" if not use_pandas else "pandas" - CppConfig.set_should_use_cpp(do_use_cpp) + yield df_type_str - yield do_use_cpp +def _get_execution_mode(request: pytest.FixtureRequest) -> "ExecutionMode": + do_gpu_mode: bool = True -# This fixture will be used by all tests. -@pytest.fixture(scope="function") -def use_cpp(_set_use_cpp: bool): + # Check for the param if this was indirectly set + if (hasattr(request, "param") and isinstance(request.param, bool)): + do_gpu_mode = request.param + else: + # If not, check for the marker and use that + gpu_mode = request.node.get_closest_marker("gpu_mode") is not None + cpu_mode = request.node.get_closest_marker("cpu_mode") is not None - # Just return the set value - yield _set_use_cpp + if (gpu_mode and cpu_mode): + raise RuntimeError(f"Both markers (gpu_mode and cpu_mode) were added to function {request.node.nodeid}. " + "Use the gpu_and_cpu_mode marker to test both.") + # if both are undefined, infer based on the df_type + if (not gpu_mode and not cpu_mode): + cpu_mode = request.node.get_closest_marker("use_pandas") is not None -@pytest.fixture(scope="function") -def config_only_cpp(): - """ - Use this fixture in unittest style tests to indicate a lack of support for C++. Use via - `@pytest.mark.usefixtures("config_only_cpp")` - """ + # This will default to True or follow gpu_mode + do_gpu_mode = not cpu_mode - from morpheus.config import Config - from morpheus.config import CppConfig + from morpheus.config import ExecutionMode + if do_gpu_mode: + return ExecutionMode.GPU - CppConfig.set_should_use_cpp(True) + return ExecutionMode.CPU - yield Config() +@pytest.fixture(name="execution_mode", scope="function", autouse=True) +def execution_mode_fixture(request: pytest.FixtureRequest): + exec_mode = _get_execution_mode(request) + yield exec_mode -@pytest.fixture(scope="function") -def config_no_cpp(): - """ - Use this fixture in unittest style tests to indicate support for C++. Use via - `@pytest.mark.usefixtures("config_no_cpp")` - """ - from morpheus.config import Config +# This fixture will be used by all tests. +@pytest.fixture(scope="function", autouse=True) +def _set_use_cpp(request: pytest.FixtureRequest): + execution_mode = _get_execution_mode(request) from morpheus.config import CppConfig - CppConfig.set_should_use_cpp(False) - - yield Config() - - -@pytest.fixture(scope="function") -def df_type(request: pytest.FixtureRequest): - - df_type_str: typing.Literal["cudf", "pandas"] - - # Check for the param if this was indirectly set - if (hasattr(request, "param")): - assert request.param in ["pandas", "cudf"], "Invalid parameter for df_type" - - df_type_str = request.param - else: - # If not, check for the marker and use that - use_pandas = request.node.get_closest_marker("use_pandas") is not None - use_cudf = request.node.get_closest_marker("use_cudf") is not None - - if (use_pandas and use_cudf): - raise RuntimeError(f"Both markers (use_cpp and use_python) were added to function {request.node.nodeid}. 
" - "Remove markers to support both.") - - # This will default to "cudf" or follow use_pandas - df_type_str = "cudf" if not use_pandas else "pandas" + do_use_cpp: bool = (execution_mode.value == "GPU") + CppConfig.set_should_use_cpp(do_use_cpp) - yield df_type_str + yield do_use_cpp @pytest.fixture(scope="function") -def config(use_cpp: bool): +def config(execution_mode: "ExecutionMode"): """ For new pytest style tests, get the config by using this fixture. It will setup the config based on the marks set on the object. If no marks are added to the test, it will be parameterized for both C++ and python. For example, ``` - @pytest.mark.use_python + @pytest.mark.cpu_mode def my_python_test(config: Config): ... ``` """ from morpheus.config import Config + config = Config() + config.execution_mode = execution_mode - yield Config() + yield config @pytest.fixture(scope="function") @@ -902,33 +866,11 @@ def test_something(dataset: DatasetManager): ``` A test that requests this fixture will parameterize on the type of DataFrame returned by the DatasetManager. - If a test requests both this fixture and the `use_cpp` fixture, or indirectly via the `config` fixture, then - the test will parameterize over both df_type:[cudf, pandas] and use_cpp[True, False]. However it will remove the - df_type=pandas & use_cpp=True combinations as this will cause an unsupported usage of Pandas dataframes with the - C++ implementation of message classes. + If a test requests both this fixture and is marked either `gpu_mode` or `cpu_mode` then only cudf or pandas will be + used to prevent an unsupported usage of Pandas dataframes with the C++ implementation of message classes, and cuDF + with CPU-only implementations. - This behavior can also be overridden by using the `use_cudf`, `use_pandas`, `use_cpp` or `use_pandas` marks ex: - ``` - # This test will only run once with C++ enabled and cudf dataframes - @pytest.mark.use_cpp - def test something(dataset: DatasetManager): - ... - # This test will run once for each dataframe type, with C++ disabled both times - @pytest.mark.use_python - import sysdf dataframes both times - @pytest.mark.use_cudf - def test something(use_cpp: bool, dataset: DatasetManager): - ... - # This test will run only once - @pytest.mark.use_cudf - @pytest.mark.use_python - def test something(dataset: DatasetManager): - ... - # This test creates an incompatible combination and will raise a RuntimeError without being executed - @pytest.mark.use_pandas - @pytest.mark.use_cpp - def test something(dataset: DatasetManager): - ``` + Similarly the `use_cudf`, `use_pandas` marks will also prevent parametarization over the DataFrame type. Users who don't want to parametarize over the DataFrame should use the `dataset_pandas` or `dataset_cudf` fixtures. """ @@ -948,7 +890,7 @@ def dataset_pandas(): In addition to this, users can use this fixture to explicitly request a cudf Dataframe as well, allowing for a test that looks like: ``` - @pytest.mark.use_cpp + @pytest.mark.gpu_mode def test_something(dataset_pandas: DatasetManager): input_df = dataset_pandas.cudf["filter_probs.csv"] # Feed our source stage a cudf DF @@ -976,12 +918,12 @@ def test_something(dataset_cudf: DatasetManager): @pytest.fixture(scope="function") -def filter_probs_df(dataset, use_cpp: bool): +def filter_probs_df(dataset): """ Shortcut fixture for loading the filter_probs.csv dataset. Unless your test uses the `use_pandas` or `use_cudf` marks this fixture will parametarize over the two dataframe - types. 
Similarly unless your test uses the `use_cpp` or `use_python` marks this fixture will also parametrize over
+    types. Similarly unless your test uses the `gpu_mode` or `cpu_mode` marks this fixture will also parametrize over
     that as well, while excluding the combination of C++ execution and Pandas dataframes.
     """
     yield dataset["filter_probs.csv"]
 
 
@@ -1179,6 +1121,18 @@ def mock_nemollm_fixture():
     yield mock_nemollm
 
 
+@pytest.fixture(name="array_pkg")
+def array_pkg_fixture(execution_mode: "ExecutionMode") -> types.ModuleType:
+    from morpheus.utils.type_utils import get_array_pkg
+    return get_array_pkg(execution_mode)
+
+
+@pytest.fixture(name="df_pkg")
+def df_pkg_fixture(execution_mode: "ExecutionMode") -> types.ModuleType:
+    from morpheus.utils.type_utils import get_df_pkg
+    return get_df_pkg(execution_mode)
+
+
 @pytest.fixture(name="mock_subscription")
 def mock_subscription_fixture():
     """
diff --git a/tests/examples/developer_guide/test_pass_thru.py b/tests/examples/developer_guide/test_pass_thru.py
index f98451f318..426b30eae2 100644
--- a/tests/examples/developer_guide/test_pass_thru.py
+++ b/tests/examples/developer_guide/test_pass_thru.py
@@ -19,40 +19,52 @@
 import pytest
 
 from _utils import TEST_DIRS
+from _utils import assert_results
 from morpheus.config import Config
-from morpheus.messages import ControlMessage
-from morpheus.messages import MessageMeta
+from morpheus.pipeline.linear_pipeline import LinearPipeline
 from morpheus.pipeline.single_port_stage import SinglePortStage
+from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage
+from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage
+from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage
 from morpheus.utils.type_aliases import DataFrameType
 
 
-def _check_pass_thru(config: Config,
-                     filter_probs_df: DataFrameType,
-                     pass_thru_stage_cls: SinglePortStage,
-                     on_data_fn_name: str = 'on_data'):
-    stage = pass_thru_stage_cls(config)
-    assert isinstance(stage, SinglePortStage)
+def _check_pass_thru(config: Config, filter_probs_df: DataFrameType, pass_thru_stage_cls: SinglePortStage):
+    pass_thru_stage = pass_thru_stage_cls(config)
+    assert isinstance(pass_thru_stage, SinglePortStage)
 
-    meta = MessageMeta(filter_probs_df)
-    msg = ControlMessage()
-    msg.payload(meta)
+    pipe = LinearPipeline(config)
+    pipe.set_source(InMemorySourceStage(config, dataframes=[filter_probs_df.copy(deep=True)]))
+    sink_1 = pipe.add_stage(InMemorySinkStage(config))
+    pipe.add_stage(pass_thru_stage)
+    sink_2 = pipe.add_stage(InMemorySinkStage(config))
+    comp_stage = pipe.add_stage(CompareDataFrameStage(config, filter_probs_df.copy(deep=True)))
+    pipe.run()
 
-    on_data_fn = getattr(stage, on_data_fn_name)
-    assert on_data_fn(msg) is msg
+    assert_results(comp_stage.get_results())
 
+    in_messages = sink_1.get_messages()
+    assert len(in_messages) == 1
+    out_messages = sink_2.get_messages()
+    assert len(out_messages) == 1
+    assert in_messages[0] is out_messages[0]
 
+
+@pytest.mark.gpu_and_cpu_mode
 @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'developer_guide/1_simple_python_stage/pass_thru.py'))
 def test_pass_thru_ex1(config: Config, filter_probs_df: DataFrameType, import_mod: types.ModuleType):
     pass_thru = import_mod
     _check_pass_thru(config, filter_probs_df, pass_thru.PassThruStage)
 
 
+@pytest.mark.gpu_and_cpu_mode
 @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir,
                                      'developer_guide/1_simple_python_stage/pass_thru_deco.py'))
 def test_pass_thru_ex1_deco(config: Config,
filter_probs_df: DataFrameType, import_mod: types.ModuleType): pass_thru = import_mod - _check_pass_thru(config, filter_probs_df, pass_thru.pass_thru_stage, on_data_fn_name='_on_data_fn') + _check_pass_thru(config, filter_probs_df, pass_thru.pass_thru_stage) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.import_mod( os.path.join(TEST_DIRS.examples_dir, 'developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py')) def test_pass_thru_ex3(config: Config, filter_probs_df: DataFrameType, import_mod: types.ModuleType): diff --git a/tests/examples/gnn_fraud_detection_pipeline/conftest.py b/tests/examples/gnn_fraud_detection_pipeline/conftest.py index 1ab1cc7544..e8f80e7054 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/conftest.py +++ b/tests/examples/gnn_fraud_detection_pipeline/conftest.py @@ -44,7 +44,7 @@ def cuml_fixture(fail_missing: bool): @pytest.fixture(name="config") -def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config): """ The GNN fraud detection pipeline utilizes the "other" pipeline mode. """ diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py index c597c430ca..de0de0826e 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py @@ -25,7 +25,7 @@ # pylint: disable=no-name-in-module -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestClassificationStage: def test_constructor(self, config: Config, xgb_model: str, cuml: types.ModuleType): diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py index ee278ef549..d7a8f47e8e 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py @@ -28,7 +28,7 @@ # pylint: disable=no-name-in-module -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestGraphConstructionStage: def test_constructor(self, config: Config, training_file: str): diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py index f272098a7d..a4a5241a25 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py @@ -25,7 +25,7 @@ # pylint: disable=no-name-in-module @pytest.mark.usefixtures("manual_seed") -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestGraphSageStage: def test_constructor(self, config: Config, model_dir: str): diff --git a/tests/examples/llm/common/test_content_extractor_module.py b/tests/examples/llm/common/test_content_extractor_module.py index 2c77737681..805f9b2b1f 100644 --- a/tests/examples/llm/common/test_content_extractor_module.py +++ b/tests/examples/llm/common/test_content_extractor_module.py @@ -88,8 +88,6 @@ def generate_random_string(length: int) -> str: return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) -@pytest.mark.use_python -@pytest.mark.use_cudf @pytest.mark.parametrize("data_len, num_rows_per_file, batch_size", [(40, 5, 2), (51, 3, 1), (150, 10, 5), (500, 3, 2), (1000, 5, 3), (50, 10, 2), (100, 20, 3), (50, 5, 1), (100, 10, 1), (49, 5, 2), (99, 5, 2), (60, 7, 2), (120, 6, 3), (1000, 50, 10), diff --git 
a/tests/examples/llm/common/test_web_scraper_module.py b/tests/examples/llm/common/test_web_scraper_module.py
index 592f5d38fb..012cb45fa3 100644
--- a/tests/examples/llm/common/test_web_scraper_module.py
+++ b/tests/examples/llm/common/test_web_scraper_module.py
@@ -30,8 +30,6 @@
 
 
 @pytest.mark.slow
-@pytest.mark.use_python
-@pytest.mark.use_cudf
 @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/vdb_upload/module/web_scraper_module.py'))
 def test_web_scraper_module(config: Config, mock_rest_server: str, import_mod: types.ModuleType):
     url = f"{mock_rest_server}/www/index"
diff --git a/tests/examples/llm/common/test_web_scraper_stage.py b/tests/examples/llm/common/test_web_scraper_stage.py
index 418d245043..6526c00df1 100644
--- a/tests/examples/llm/common/test_web_scraper_stage.py
+++ b/tests/examples/llm/common/test_web_scraper_stage.py
@@ -28,8 +28,6 @@
 
 
 @pytest.mark.slow
-@pytest.mark.use_python
-@pytest.mark.use_cudf
 @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/vdb_upload/module/web_scraper_stage.py'))
 def test_http_client_source_stage_pipe(config: Config, mock_rest_server: str, import_mod: types.ModuleType):
     url = f"{mock_rest_server}/www/index"
diff --git a/tests/examples/llm/vdb_upload/test_schema_transform_module.py b/tests/examples/llm/vdb_upload/test_schema_transform_module.py
index 8a4ed6e870..75dc7178a5 100644
--- a/tests/examples/llm/vdb_upload/test_schema_transform_module.py
+++ b/tests/examples/llm/vdb_upload/test_schema_transform_module.py
@@ -27,8 +27,6 @@
 from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage
 
 
-@pytest.mark.use_python
-@pytest.mark.use_cudf
 @pytest.mark.parametrize("num_select, num_renames", [(1, 0), (0, 1), (1, 1), (6, 6), (13, 10), (10, 13)])
 def test_schema_transform_module(num_select,
                                  num_renames,
diff --git a/tests/examples/log_parsing/conftest.py b/tests/examples/log_parsing/conftest.py
index f927c3fcc1..d31891873a 100644
--- a/tests/examples/log_parsing/conftest.py
+++ b/tests/examples/log_parsing/conftest.py
@@ -17,7 +17,7 @@
 
 @pytest.fixture(name="config")
-def config_fixture(config, use_cpp: bool):  # pylint: disable=unused-argument
+def config_fixture(config):
     """
     The log_parsing pipeline requires NLP mode. Set this here so all the tests don't need to set it themselves.
""" diff --git a/tests/examples/log_parsing/test_inference.py b/tests/examples/log_parsing/test_inference.py index f4a7aac660..a721d8afc7 100644 --- a/tests/examples/log_parsing/test_inference.py +++ b/tests/examples/log_parsing/test_inference.py @@ -22,10 +22,10 @@ import numpy as np import pytest -import morpheus._lib.messages as _messages from _utils import TEST_DIRS from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryNLP from morpheus.messages import MessageMeta from morpheus.messages import TensorMemory from morpheus.stages.inference.triton_inference_stage import TritonInferenceWorker @@ -52,13 +52,13 @@ def build_resp_message(df: DataFrameType, num_cols: int = 2) -> ControlMessage: seq_ids[:, 2] = 42 meta = MessageMeta(df) - mem = _messages.TensorMemory(count=count, - tensors={ - 'confidences': cp.zeros((count, num_cols)), - 'labels': cp.zeros((count, num_cols)), - 'input_ids': cp.zeros((count, num_cols), dtype=cp.float32), - 'seq_ids': seq_ids - }) + mem = TensorMemory(count=count, + tensors={ + 'confidences': cp.zeros((count, num_cols)), + 'labels': cp.zeros((count, num_cols)), + 'input_ids': cp.zeros((count, num_cols), dtype=cp.float32), + 'seq_ids': seq_ids + }) cm = ControlMessage() cm.payload(meta) cm.tensors(mem) @@ -78,10 +78,10 @@ def build_inf_message(df: DataFrameType, mess_count: int, count: int, num_cols: seq_ids[:, 2] = 42 meta = MessageMeta(df) - mem = _messages.InferenceMemoryNLP(count=tensor_length, - input_ids=cp.zeros((tensor_length, num_cols), dtype=cp.float32), - input_mask=cp.zeros((tensor_length, num_cols), dtype=cp.float32), - seq_ids=seq_ids) + mem = InferenceMemoryNLP(count=tensor_length, + input_ids=cp.zeros((tensor_length, num_cols), dtype=cp.float32), + input_mask=cp.zeros((tensor_length, num_cols), dtype=cp.float32), + seq_ids=seq_ids) cm = ControlMessage() cm.payload(meta) cm.tensors(mem) diff --git a/tests/examples/log_parsing/test_postprocessing.py b/tests/examples/log_parsing/test_postprocessing.py index 48baeaddc1..e6271d8a42 100644 --- a/tests/examples/log_parsing/test_postprocessing.py +++ b/tests/examples/log_parsing/test_postprocessing.py @@ -23,12 +23,12 @@ import numpy as np import pytest -import morpheus._lib.messages as _messages from _utils import TEST_DIRS from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory @pytest.fixture(scope='module', name="model_config_file") @@ -55,7 +55,7 @@ def build_post_proc_message(dataset_cudf: DatasetManager, log_test_data_dir: str seq_ids[:, 2] = cp.asarray(host__seq_data)[:, 2] tensors['seq_ids'] = seq_ids - memory = _messages.TensorMemory(count=5, tensors=tensors) + memory = TensorMemory(count=5, tensors=tensors) msg = ControlMessage() msg.payload(meta) diff --git a/tests/examples/ransomware_detection/conftest.py b/tests/examples/ransomware_detection/conftest.py index 7c3ca3e74e..9beffab06a 100644 --- a/tests/examples/ransomware_detection/conftest.py +++ b/tests/examples/ransomware_detection/conftest.py @@ -38,7 +38,7 @@ def dask_distributed_fixture(dask_distributed): @pytest.fixture(name="config") -def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config): """ The ransomware detection pipeline utilizes the FIL pipeline mode. 
""" diff --git a/tests/examples/ransomware_detection/test_create_features.py b/tests/examples/ransomware_detection/test_create_features.py index 29c06efdc1..9f5ffa9218 100644 --- a/tests/examples/ransomware_detection/test_create_features.py +++ b/tests/examples/ransomware_detection/test_create_features.py @@ -19,18 +19,15 @@ import typing from unittest import mock -import pytest +import pandas as pd from _utils import TEST_DIRS from _utils.dataset_manager import DatasetManager from morpheus.config import Config -from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import AppShieldMessageMeta from morpheus.pipeline.control_message_stage import ControlMessageStage from morpheus.stages.input.appshield_source_stage import AppShieldSourceStage -@pytest.mark.use_python class TestCreateFeaturesRWStage: # pylint: disable=no-name-in-module @@ -92,18 +89,25 @@ def test_on_next(self, mock_dask_client.submit.return_value = mock_dask_future input_glob = os.path.join(TEST_DIRS.tests_data_dir, 'appshield', 'snapshot-1', '*.json') - input_data = AppShieldSourceStage.files_to_dfs(glob.glob(input_glob), - cols_include=rwd_conf['raw_columns'], - cols_exclude=["SHA256"], - plugins_include=interested_plugins, - encoding='latin1') + appshield_source_stage = AppShieldSourceStage(config, + input_glob, + plugins_include=interested_plugins, + cols_include=rwd_conf['raw_columns'], + cols_exclude=["SHA256"], + encoding='latin1') - input_metas = AppShieldSourceStage._build_metadata(input_data) + input_data = appshield_source_stage.files_to_dfs(glob.glob(input_glob), + cols_include=rwd_conf['raw_columns'], + cols_exclude=["SHA256"], + plugins_include=interested_plugins, + encoding='latin1') + + input_messages = appshield_source_stage._build_messages(input_data) # Make sure the input test date looks the way we expect it - assert len(input_metas) == 1 - input_meta = input_metas[0] - assert input_meta.source == 'appshield' + assert len(input_messages) == 1 + input_message = input_messages[0] + assert input_message.get_metadata('source') == 'appshield' stage = CreateFeaturesRWStage(config, interested_plugins=interested_plugins, @@ -115,71 +119,24 @@ def test_on_next(self, # make sure we have a mocked dask client assert stage._client is mock_dask_client - meta = stage.on_next(input_meta) - assert isinstance(meta, AppShieldMessageMeta) - assert meta.source == input_meta.source + messages = stage.on_next(input_message) + + dataframes = [] + for message in messages: + assert message.get_metadata('source') == input_message.get_metadata('source') + dataframes.append(message.payload().copy_dataframe().to_pandas()) + + actual_df = pd.concat(dataframes, ignore_index=True) + actual_df.sort_values(by=["pid_process", "snapshot_id"], inplace=True) + actual_df.reset_index(drop=True, inplace=True) expected_df = dataset_pandas[os.path.join(test_data_dir, 'dask_results.csv')] expected_df['source_pid_process'] = 'appshield_' + expected_df.pid_process + expected_df['ldrmodules_df_path'] = expected_df['ldrmodules_df_path'].astype(str) # convert to string expected_df.sort_values(by=["pid_process", "snapshot_id"], inplace=True) expected_df.reset_index(drop=True, inplace=True) - dataset_pandas.assert_compare_df(meta.copy_dataframe(), expected_df) - - @mock.patch('stages.create_features.Client') - def test_create_control_messages(self, - mock_dask_client, - config: Config, - rwd_conf: dict, - interested_plugins: typing.List[str], - dataset_pandas: DatasetManager): - from stages.create_features import 
CreateFeaturesRWStage - mock_dask_client.return_value = mock_dask_client - - pids = [75956, 118469, 1348612, 2698363, 2721362, 2788672] - df = dataset_pandas["filter_probs.csv"] - df['pid_process'] = [ - 2788672, - 75956, - 75956, - 2788672, - 2788672, - 2698363, - 2721362, - 118469, - 1348612, - 2698363, - 118469, - 2698363, - 1348612, - 118469, - 75956, - 2721362, - 75956, - 118469, - 118469, - 118469 - ] - df = df.sort_values(by=["pid_process"]).reset_index(drop=True) - - stage = CreateFeaturesRWStage(config, - interested_plugins=interested_plugins, - feature_columns=rwd_conf['model_features'], - file_extns=rwd_conf['file_extensions'], - n_workers=5, - threads_per_worker=6) - - meta = AppShieldMessageMeta(df, source='tests') - control_messages = stage.create_control_messages(meta) - assert len(control_messages) == len(pids) - - prev_loc = 0 - for (i, _control_message) in enumerate(control_messages): - assert isinstance(_control_message, ControlMessage) - pid = pids[i] - (_control_message.payload().get_data(['pid_process']) == pid).all() - prev_loc = prev_loc + _control_message.payload().count - assert prev_loc == len(df) + dataset_pandas.assert_compare_df(actual_df, expected_df) @mock.patch('stages.create_features.Client') def test_on_completed(self, mock_dask_client, config: Config, rwd_conf: dict, interested_plugins: typing.List[str]): diff --git a/tests/examples/ransomware_detection/test_preprocessing.py b/tests/examples/ransomware_detection/test_preprocessing.py index ad9d3b74eb..9d1f8e81ef 100644 --- a/tests/examples/ransomware_detection/test_preprocessing.py +++ b/tests/examples/ransomware_detection/test_preprocessing.py @@ -20,11 +20,10 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage -@pytest.mark.use_python class TestPreprocessingRWStage: # pylint: disable=no-name-in-module @@ -147,22 +146,19 @@ def test_merge_curr_and_prev_snapshots(self, config: Config, rwd_conf: dict, dat stage._merge_curr_and_prev_snapshots(df, source_pid_process) dataset_pandas.assert_compare_df(df.fillna(''), expected_df) - def test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_pandas: DatasetManager): - - # Pylint currently fails to work with classmethod: https://github.com/pylint-dev/pylint/issues/981 - # pylint: disable=no-member - + def test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_cudf: DatasetManager): from stages.preprocessing import PreprocessingRWStage - df = dataset_pandas['examples/ransomware_detection/dask_results.csv'] + df = dataset_cudf['examples/ransomware_detection/dask_results.csv'] df['source_pid_process'] = 'appshield_' + df.pid_process expected_df = df.copy(deep=True).fillna('') - meta = AppShieldMessageMeta(df=df, source='tests') - control_msg = ControlMessage() - control_msg.payload(meta) + meta = MessageMeta(df) + cm = ControlMessage() + cm.payload(meta) + cm.set_metadata('source', 'tests') sliding_window = 4 stage = PreprocessingRWStage(config, feature_columns=rwd_conf['model_features'], sliding_window=sliding_window) - results: ControlMessage = stage._pre_process_batch(control_msg) + results: ControlMessage = stage._pre_process_batch(cm) assert isinstance(results, ControlMessage) expected_df['sequence'] = ['dummy' for _ in range(len(expected_df))] @@ -171,6 +167,9 @@ def 
test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_pandas: expected_seq_ids[:, 0] = cp.arange(0, len(expected_df), dtype=cp.uint32) expected_seq_ids[:, 2] = len(rwd_conf['model_features']) * 3 - dataset_pandas.assert_compare_df(results.payload().get_data().fillna(''), expected_df) - assert (results.tensors().get_tensor('input__0') == expected_input__0).all() - assert (results.tensors().get_tensor('seq_ids') == expected_seq_ids).all() + actual_df = results.payload().copy_dataframe().to_pandas().fillna('') + dataset_cudf.assert_compare_df(actual_df, expected_df) + + actual_tensors = results.tensors() + assert (actual_tensors.get_tensor('input__0') == expected_input__0).all() + assert (actual_tensors.get_tensor('seq_ids') == expected_seq_ids).all() diff --git a/tests/morpheus/apps/test_abp.py b/tests/morpheus/apps/test_abp.py index 17d23f248a..f90f9e1eef 100755 --- a/tests/morpheus/apps/test_abp.py +++ b/tests/morpheus/apps/test_abp.py @@ -15,15 +15,12 @@ # limitations under the License. import os -from unittest import mock -import numpy as np import pytest from _utils import TEST_DIRS from _utils import calc_error_val from _utils import compare_class_to_scores -from _utils import mk_async_infer from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.config import PipelineModes @@ -48,71 +45,7 @@ @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(PreprocessFILStage(config)) - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, 
unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - pipe.add_stage(AddScoresStage(config, prefix="score_")) - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - compare_class_to_scores(out_file, config.class_labels, '', 'score_', threshold=0.5) - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.FIL @@ -161,90 +94,7 @@ def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_multi_segment_no_cpp(mock_triton_client: mock.MagicMock, - config: Config, - tmp_path: str, - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 1 - - pipe.add_stage(PreprocessFILStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 2 - - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 3 - - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 4 - - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) 
- - pipe.add_segment_boundary(ControlMessage) # Boundary 5 - - pipe.add_stage(SerializeStage(config)) - - pipe.add_segment_boundary(MessageMeta) # Boundary 6 - - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_multi_segment_cpp(config, tmp_path): diff --git a/tests/morpheus/apps/test_abp_kafka.py b/tests/morpheus/apps/test_abp_kafka.py index 46306ff29c..7c241b8a5d 100755 --- a/tests/morpheus/apps/test_abp_kafka.py +++ b/tests/morpheus/apps/test_abp_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from _utils.kafka import write_file_to_kafka @@ -54,100 +51,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - - # Fill our topic with the input data - num_records = write_file_to_kafka(kafka_bootstrap_servers, kafka_topics.input_topic, val_file_name) - - pipe = LinearPipeline(config) - pipe.set_source( - KafkaSourceStage(config, - bootstrap_servers=kafka_bootstrap_servers, - input_topic=kafka_topics.input_topic, - auto_offset_reset="earliest", - poll_interval="1seconds", - stop_after=num_records, - client_id="test_abp_no_cpp_reader")) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(PreprocessFILStage(config)) - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, 
description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, - bootstrap_servers=kafka_bootstrap_servers, - output_topic=kafka_topics.output_topic, - client_id="test_abp_no_cpp_writer")) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f'{rec.value.decode("utf-8")}\n') - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == num_records - - results = compare_df(val_df, output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 0 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_cpp(config: Config, dataset_pandas: DatasetManager, diff --git a/tests/morpheus/apps/test_phishing.py b/tests/morpheus/apps/test_phishing.py index 77e752ef3f..41f3d9c7c2 100755 --- a/tests/morpheus/apps/test_phishing.py +++ b/tests/morpheus/apps/test_phishing.py @@ -15,14 +15,11 @@ # limitations under the License. import os -from unittest import mock -import numpy as np import pytest from _utils import TEST_DIRS from _utils import calc_error_val -from _utils import mk_async_infer from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.pipeline import LinearPipeline @@ -43,75 +40,7 @@ @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_email_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 2] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_phishing_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'phishing-email-validation-data.jsonlines') - vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, 
filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', - force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - results = calc_error_val(results_file_name) - assert results.diff_rows == 153 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_email_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.NLP diff --git a/tests/morpheus/apps/test_phishing_kafka.py b/tests/morpheus/apps/test_phishing_kafka.py index 1a04061cc9..3524cc62f4 100755 --- a/tests/morpheus/apps/test_phishing_kafka.py +++ b/tests/morpheus/apps/test_phishing_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from _utils.kafka import write_file_to_kafka @@ -53,103 +50,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_email_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 2] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_phishing_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 
'phishing-email-validation-data.jsonlines') - vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - - num_records = write_file_to_kafka(kafka_bootstrap_servers, kafka_topics.input_topic, val_file_name) - - # Disabling commits due to known issue in Python impl: https://github.com/nv-morpheus/Morpheus/issues/294 - pipe = LinearPipeline(config) - pipe.set_source( - KafkaSourceStage(config, - bootstrap_servers=kafka_bootstrap_servers, - input_topic=kafka_topics.input_topic, - auto_offset_reset="earliest", - poll_interval="1seconds", - disable_commit=True, - stop_after=num_records)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', - force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f"{rec.value.decode('utf-8')}\n") - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == num_records - - results = compare_df(val_df, output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 153 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_email_cpp(dataset_pandas: DatasetManager, config: Config, diff --git a/tests/morpheus/apps/test_sid.py b/tests/morpheus/apps/test_sid.py index 304d6a5f04..4fb5616b82 100755 --- a/tests/morpheus/apps/test_sid.py +++ b/tests/morpheus/apps/test_sid.py @@ -169,7 +169,7 @@ def _run_minibert(*, @pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_minibert_no_trunc(config: Config, tmp_path: str, morpheus_log_level: int): diff --git a/tests/morpheus/apps/test_sid_kafka.py b/tests/morpheus/apps/test_sid_kafka.py index eb70a98fc9..5d85188d6f 100755 --- a/tests/morpheus/apps/test_sid_kafka.py +++ b/tests/morpheus/apps/test_sid_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from morpheus.config import Config @@ -51,102 +48,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_minibert_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT32", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT32", "shape": [-1, FEATURE_LENGTH] - }], - 
"outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 10] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_sid_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = [ - "address", - "bank_acct", - "credit_card", - "email", - "govt_id", - "name", - "password", - "phone_num", - "secret_keys", - "user" - ] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'sid-validation-data.csv') - vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='sid-minibert-onnx', server_url='fake:001', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="si_")) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f"{rec.value.decode('utf-8')}\n") - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == len(val_df) - - results = compare_df(val_df, output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 1333 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_minibert_cpp(dataset_pandas: DatasetManager, config: Config, diff --git a/tests/morpheus/controllers/test_elasticsearch_controller.py b/tests/morpheus/controllers/test_elasticsearch_controller.py index 903e4bf14f..3a136b0cd8 100644 --- a/tests/morpheus/controllers/test_elasticsearch_controller.py +++ b/tests/morpheus/controllers/test_elasticsearch_controller.py @@ -48,14 +48,12 @@ def inner_create_controller(*, connection_kwargs=connection_kwargs, refresh_peri yield inner_create_controller -@pytest.mark.use_python def test_constructor(create_controller: typing.Callable[..., ElasticsearchController], connection_kwargs: dict): assert create_controller(raise_on_exception=True)._raise_on_exception is True assert 
create_controller(refresh_period_secs=1.5)._refresh_period_secs == 1.5 assert create_controller()._connection_kwargs == connection_kwargs -@pytest.mark.use_python def test_refresh_client_force(create_controller: typing.Callable[..., ElasticsearchController]): controller = create_controller(refresh_period_secs=1) @@ -68,7 +66,6 @@ def test_refresh_client_force(create_controller: typing.Callable[..., Elasticsea assert controller._last_refresh_time > 0 -@pytest.mark.use_python def test_refresh_client_not_needed(create_controller: typing.Callable[..., ElasticsearchController]): controller = create_controller() client = controller._client @@ -81,7 +78,6 @@ def test_refresh_client_not_needed(create_controller: typing.Callable[..., Elast assert is_refreshed is False -@pytest.mark.use_python def test_refresh_client_needed(create_controller: typing.Callable[..., ElasticsearchController]): # Set a 1 second refresh period @@ -98,7 +94,6 @@ def test_refresh_client_needed(create_controller: typing.Callable[..., Elasticse assert is_refreshed is True -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.parallel_bulk", return_value=[(True, None)]) def test_parallel_bulk_write(mock_parallel_bulk, create_controller: typing.Callable[..., ElasticsearchController]): # Define your mock actions @@ -108,7 +103,6 @@ def test_parallel_bulk_write(mock_parallel_bulk, create_controller: typing.Calla mock_parallel_bulk.assert_called_once() -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.parallel_bulk", return_value=[(True, None)]) def test_df_to_parallel_bulk_write(mock_parallel_bulk: typing.Callable, create_controller: typing.Callable[..., ElasticsearchController]): diff --git a/tests/morpheus/dfencoder/test_autoencoder.py b/tests/morpheus/dfencoder/test_autoencoder.py index bd02907f92..be11cb4cf8 100755 --- a/tests/morpheus/dfencoder/test_autoencoder.py +++ b/tests/morpheus/dfencoder/test_autoencoder.py @@ -37,7 +37,7 @@ from morpheus.models.dfencoder.dataloader import FileSystemDataset # Only pandas and Python is supported -pytestmark = [pytest.mark.use_pandas, pytest.mark.use_python] +pytestmark = [pytest.mark.use_pandas, pytest.mark.cpu_mode] BIN_COLS = ['ts_anomaly'] diff --git a/tests/morpheus/dfencoder/test_pkg.py b/tests/morpheus/dfencoder/test_pkg.py deleted file mode 100755 index 3b5d39585c..0000000000 --- a/tests/morpheus/dfencoder/test_pkg.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - - -@pytest.mark.skip -def test_old_dfencoder_not_in_env(): - """ - Verify the old external dfencoder doesn't exist in the current env - """ - with pytest.raises(ModuleNotFoundError): - import dfencoder # noqa: F401 #pylint:disable=unused-import diff --git a/tests/morpheus/io/test_io_utils.py b/tests/morpheus/io/test_io_utils.py index 1ad46b75cb..3c3e241ce8 100755 --- a/tests/morpheus/io/test_io_utils.py +++ b/tests/morpheus/io/test_io_utils.py @@ -14,14 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +import typing from collections.abc import Callable +import pandas as pd import pytest import cudf from _utils.dataset_manager import DatasetManager +from morpheus.config import ExecutionMode from morpheus.io import utils as io_utils +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType MULTI_BYTE_STRINGS = ["ñäμɛ", "Moρφέας", "taç"] @@ -132,3 +136,15 @@ def test_truncate_string_cols_by_bytes(dataset: DatasetManager, assert isinstance(df, expected_df_class) dataset.assert_df_equal(df, expected_df) + + +@pytest.mark.parametrize("mode, expected", + [(ExecutionMode.GPU, cudf.read_json), (ExecutionMode.CPU, pd.read_json), + ("cudf", cudf.read_json), ("pandas", pd.read_json)]) +def test_get_json_reader(mode: typing.Union[ExecutionMode, DataFrameModule], expected: Callable[..., DataFrameType]): + reader = io_utils.get_json_reader(mode) + if hasattr(reader, "func"): + # Unwrap partial + reader = reader.func + + assert reader is expected diff --git a/tests/morpheus/messages/test_control_message.py b/tests/morpheus/messages/test_control_message.py index 85f2aa344f..b9ba42d079 100644 --- a/tests/morpheus/messages/test_control_message.py +++ b/tests/morpheus/messages/test_control_message.py @@ -18,24 +18,25 @@ import io import sys -import cupy as cp import pytest from _utils.dataset_manager import DatasetManager from morpheus import messages +from morpheus.config import Config from morpheus.messages import TensorMemory +from morpheus.utils.type_utils import get_array_pkg # pylint: disable=unsupported-membership-test # pylint: disable=unsubscriptable-object -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_init(): messages.ControlMessage() # noqa: F841 messages.ControlMessage({"test": "test"}) # noqa: F841 -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_tasks(): message = messages.ControlMessage() assert len(message.get_tasks()) == 0 @@ -70,12 +71,6 @@ def test_control_message_tasks(): assert message.get_tasks()["type_a"][0]["key_x"] == "value_x" assert message.get_tasks()["type_a"][1]["key_y"] == "value_y" - # Ensure the underlying tasks cannot are not modified - message = messages.ControlMessage() - tasks = message.get_tasks() - tasks["type_a"] = [{"key_x", "value_x"}] # pylint: disable=unsupported-assignment-operation - assert len(message.get_tasks()) == 0 - message = messages.ControlMessage() message.add_task("type_a", {"key_x": "value_x"}) message.add_task("type_a", {"key_y": "value_y"}) @@ -86,7 +81,7 @@ def test_control_message_tasks(): assert message.get_tasks()["type_a"][1]["key_y"] == "value_y" -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_metadata(): message = messages.ControlMessage() @@ -108,11 +103,8 @@ def test_control_message_metadata(): assert message.get_metadata()["key_y"] == "value_yy" - 
message.get_metadata()["not_mutable"] = 5 # pylint: disable=unsupported-assignment-operation - - assert "not_mutable" not in message.get_metadata() - +@pytest.mark.gpu_and_cpu_mode def test_set_and_get_metadata(): message = messages.ControlMessage() @@ -132,6 +124,7 @@ def test_set_and_get_metadata(): assert all_metadata["another_key"] == "another_value" +@pytest.mark.gpu_and_cpu_mode def test_list_metadata(): message = messages.ControlMessage() @@ -146,6 +139,7 @@ def test_list_metadata(): assert set(keys) == {"key1", "key2", "key3"} +@pytest.mark.gpu_and_cpu_mode def test_get_metadata_default_value(): message = messages.ControlMessage() @@ -159,7 +153,7 @@ def test_get_metadata_default_value(): assert message.get_metadata("non_existing_key", "default_value") == "default_value" -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_get(): raw_control_message = messages.ControlMessage({ "test": "test_rcm", "tasks": [{ @@ -183,7 +177,7 @@ def test_control_message_get(): assert (control_message.has_task("load")) -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_set(): raw_control_message = messages.ControlMessage() control_message = messages.ControlMessage() @@ -204,6 +198,7 @@ def test_control_message_set(): assert (control_message.has_task("load")) +@pytest.mark.gpu_and_cpu_mode def test_control_message_set_and_get_payload(dataset: DatasetManager): df = dataset["test_dataframe.jsonlines"] @@ -217,7 +212,7 @@ def test_control_message_set_and_get_payload(dataset: DatasetManager): DatasetManager.assert_df_equal(payload.df, payload2.df) -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_set_and_get_timestamp_single(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -234,7 +229,7 @@ def test_set_and_get_timestamp_single(): assert result == timestamp, "The retrieved timestamp should match the one that was set." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_filter_timestamp(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -255,7 +250,7 @@ def test_filter_timestamp(): assert result[f"{group}::key2"] == timestamp2, "The timestamp for key2 should match." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_get_timestamp_fail_if_nonexist(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -269,10 +264,15 @@ def test_get_timestamp_fail_if_nonexist(): assert str(exc_info.value) == "Timestamp for the specified key does not exist." -# Test setting and getting tensors with cupy arrays -@pytest.mark.usefixtures("config_only_cpp") -def test_tensors_setting_and_getting(): - data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1])} +@pytest.mark.gpu_and_cpu_mode +def test_tensors_setting_and_getting(config: Config): + # Test setting and getting tensors with cupy/numpy arrays + array_pkg = get_array_pkg(config.execution_mode) + data = { + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) + } message = messages.ControlMessage() tensor_memory = TensorMemory(count=data["input_ids"].shape[0]) tensor_memory.set_tensors(data) @@ -283,14 +283,17 @@ def test_tensors_setting_and_getting(): assert retrieved_tensors.count == data["input_ids"].shape[0], "Tensor count mismatch." 
for key, val in data.items(): - assert cp.allclose(retrieved_tensors.get_tensor(key), val), f"Mismatch in tensor data for {key}." + assert array_pkg.allclose(retrieved_tensors.get_tensor(key), val), f"Mismatch in tensor data for {key}." -# Test retrieving tensor names and checking specific tensor existence -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_names_and_existence(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_names_and_existence(config: Config): + # Test retrieving tensor names and checking specific tensor existence + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=tokenized_data["input_ids"].shape[0], tensors=tokenized_data) @@ -303,11 +306,14 @@ def test_tensor_names_and_existence(): assert retrieved_tensors.has_tensor(key), f"Tensor {key} should exist." -# Test manipulating tensors after retrieval -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_manipulation_after_retrieval(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_manipulation_after_retrieval(config: Config): + # Test manipulating tensors after retrieval + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=tokenized_data) @@ -315,17 +321,20 @@ def test_tensor_manipulation_after_retrieval(): message.tensors(tensor_memory) retrieved_tensors = message.tensors() - new_tensor = cp.array([4, 5, 6]) + new_tensor = array_pkg.array([4, 5, 6]) retrieved_tensors.set_tensor("new_tensor", new_tensor) - assert cp.allclose(retrieved_tensors.get_tensor("new_tensor"), new_tensor), "New tensor data mismatch." + assert array_pkg.allclose(retrieved_tensors.get_tensor("new_tensor"), new_tensor), "New tensor data mismatch." 
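
The rewritten tensor tests above all go through `get_array_pkg` from `morpheus.utils.type_utils` so that the same assertions run against CuPy arrays in GPU mode and NumPy arrays in CPU mode. As a rough illustration only (the real helper may differ in details), the dispatch it provides amounts to the following, where `get_array_pkg_sketch` is a hypothetical stand-in:

```python
# Hypothetical sketch of the execution-mode dispatch the tests above rely on;
# the actual morpheus.utils.type_utils.get_array_pkg may differ in details.
import types

from morpheus.config import ExecutionMode


def get_array_pkg_sketch(execution_mode: ExecutionMode) -> types.ModuleType:
    """Return the array module matching the mode: CuPy on GPU, NumPy on CPU."""
    if execution_mode == ExecutionMode.GPU:
        import cupy  # GPU-only dependency, imported lazily inside the branch
        return cupy

    import numpy
    return numpy
```

Keeping the `cupy` import inside the GPU branch means a CPU-only environment never attempts to import a GPU package at module load time.
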
-# Assuming there's functionality to update all tensors at once -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_update(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_update(config: Config): + # Assuming there's functionality to update all tensors at once + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=tokenized_data) @@ -334,7 +343,9 @@ def test_tensor_update(): # Update tensors with new data new_tensors = { - "input_ids": cp.array([4, 5, 6]), "input_mask": cp.array([1, 0, 1]), "segment_ids": cp.array([1, 1, 0]) + "input_ids": array_pkg.array([4, 5, 6]), + "input_mask": array_pkg.array([1, 0, 1]), + "segment_ids": array_pkg.array([1, 1, 0]) } tensor_memory.set_tensors(new_tensors) @@ -342,13 +353,14 @@ def test_tensor_update(): updated_tensors = message.tensors() for key, val in new_tensors.items(): - assert cp.allclose(updated_tensors.get_tensor(key), val), f"Mismatch in updated tensor data for {key}." + assert array_pkg.allclose(updated_tensors.get_tensor(key), val), f"Mismatch in updated tensor data for {key}." -@pytest.mark.usefixtures("config_only_cpp") -def test_update_individual_tensor(): - initial_data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1])} - update_data = {"input_ids": cp.array([4, 5, 6])} +@pytest.mark.gpu_and_cpu_mode +def test_update_individual_tensor(config: Config): + array_pkg = get_array_pkg(config.execution_mode) + initial_data = {"input_ids": array_pkg.array([1, 2, 3]), "input_mask": array_pkg.array([1, 1, 1])} + update_data = {"input_ids": array_pkg.array([4, 5, 6])} message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=initial_data) message.tensors(tensor_memory) @@ -358,14 +370,14 @@ def test_update_individual_tensor(): retrieved_tensors = message.tensors() # Check updated tensor - assert cp.allclose(retrieved_tensors.get_tensor("input_ids"), - update_data["input_ids"]), "Input IDs update mismatch." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_ids"), + update_data["input_ids"]), "Input IDs update mismatch." # Ensure other tensor remains unchanged - assert cp.allclose(retrieved_tensors.get_tensor("input_mask"), - initial_data["input_mask"]), "Input mask should remain unchanged after updating input_ids." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_mask"), + initial_data["input_mask"]), "input_mask should be unchanged after updating input_ids." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_behavior_with_empty_tensors(): message = messages.ControlMessage() tensor_memory = TensorMemory(count=0) @@ -376,26 +388,27 @@ def test_behavior_with_empty_tensors(): assert len(retrieved_tensors.tensor_names) == 0, "There should be no tensor names for empty tensor memory." 
-@pytest.mark.usefixtures("config_only_cpp") -def test_consistency_after_multiple_operations(): - initial_data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1])} +@pytest.mark.gpu_and_cpu_mode +def test_consistency_after_multiple_operations(config: Config): + array_pkg = get_array_pkg(config.execution_mode) + initial_data = {"input_ids": array_pkg.array([1, 2, 3]), "input_mask": array_pkg.array([1, 1, 1])} message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=initial_data) message.tensors(tensor_memory) # Update a tensor - tensor_memory.set_tensor("input_ids", cp.array([4, 5, 6])) + tensor_memory.set_tensor("input_ids", array_pkg.array([4, 5, 6])) # Remove another tensor # Add a new tensor - new_tensor = {"new_tensor": cp.array([7, 8, 9])} + new_tensor = {"new_tensor": array_pkg.array([7, 8, 9])} tensor_memory.set_tensor("new_tensor", new_tensor["new_tensor"]) retrieved_tensors = message.tensors() assert retrieved_tensors.count == 3, "Tensor count mismatch after multiple operations." - assert cp.allclose(retrieved_tensors.get_tensor("input_ids"), - cp.array([4, 5, 6])), "Mismatch in input_ids after update." - assert cp.allclose(retrieved_tensors.get_tensor("new_tensor"), - new_tensor["new_tensor"]), "New tensor data mismatch." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_ids"), + array_pkg.array([4, 5, 6])), "Mismatch in input_ids after update." + assert array_pkg.allclose(retrieved_tensors.get_tensor("new_tensor"), + new_tensor["new_tensor"]), "New tensor data mismatch." class NonSerializablePyClass(): @@ -428,7 +441,7 @@ def fixture_pyobject(request): return request.param() -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_mode def test_metadata_holds_non_serializable_python_obj(py_object): message = messages.ControlMessage() @@ -452,7 +465,7 @@ def test_metadata_holds_non_serializable_python_obj(py_object): assert obj is metadata_dict_with_obj["nested_obj"] -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_mode def test_tasks_hold_non_serializable_python_obj(py_object): message = messages.ControlMessage() diff --git a/tests/morpheus/messages/test_message_meta.py b/tests/morpheus/messages/test_message_meta.py index b5e2606976..db28ea80d7 100644 --- a/tests/morpheus/messages/test_message_meta.py +++ b/tests/morpheus/messages/test_message_meta.py @@ -37,10 +37,8 @@ def fixture_index_type(request: pytest.FixtureRequest) -> typing.Literal["normal @pytest.fixture(name="df", scope="function") def fixture_df( - use_cpp: bool, # pylint: disable=unused-argument - dataset: DatasetManager, - index_type: typing.Literal['normal', 'skip', 'dup', 'down', - 'updown']) -> typing.Union[cudf.DataFrame, pd.DataFrame]: + dataset: DatasetManager, index_type: typing.Literal['normal', 'skip', 'dup', 'down', + 'updown']) -> typing.Union[cudf.DataFrame, pd.DataFrame]: test_df = dataset["test_dataframe.jsonlines"] if (index_type == "normal"): @@ -296,7 +294,7 @@ def test_update_dataframe(df: DataFrameType): assert meta.get_data()[col_new_int_name].isin(col_new_int).all() # pylint: disable=unsubscriptable-object -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_pandas_df_cpp(dataset_pandas: DatasetManager): """ Test for issue #821, calling the `df` property returns an empty cudf dataframe. 
@@ -324,12 +322,12 @@ def test_cast(config: Config, dataset: DatasetManager): # pylint: disable=unuse @pytest.mark.use_pandas -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_cast_python_to_cpp(dataset: DatasetManager): """ Test that we can cast a python MessageMeta to a C++ MessageMeta """ - df = dataset["test_dataframe.jsonlines"] + df = dataset["filter_probs.csv"] py_meta = MessageMeta(df) assert isinstance(py_meta, MessageMeta) @@ -343,12 +341,12 @@ def test_cast_python_to_cpp(dataset: DatasetManager): @pytest.mark.use_pandas -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_cast_cpp_to_python(dataset: DatasetManager): """ Test that we can cast a C++ MessageMeta to a python MessageMeta """ - df = dataset["test_dataframe.jsonlines"] + df = dataset["filter_probs.csv"] cpp_meta = MessageMetaCpp(df) py_meta = MessageMeta(cpp_meta) diff --git a/tests/morpheus/messages/test_messages.py b/tests/morpheus/messages/test_messages.py index 6c376f7e54..9fb99f1fd5 100644 --- a/tests/morpheus/messages/test_messages.py +++ b/tests/morpheus/messages/test_messages.py @@ -13,9 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import importlib -import os - import cupy as cp import pytest @@ -23,6 +20,7 @@ import morpheus._lib.messages as _messages import morpheus.config +import morpheus.utils.type_utils from morpheus import messages from morpheus.messages.memory import tensor_memory @@ -96,18 +94,6 @@ def check_all_messages(should_be_cpp: bool, no_cpp_class: bool): check_message(messages.ResponseMemoryAE, None, should_be_cpp, no_cpp_class, **{"count": 1, "probs": cp_array}) +@pytest.mark.gpu_mode def test_constructor_cpp(): check_all_messages(morpheus.config.CppConfig.get_should_use_cpp(), False) - - -@pytest.mark.reload_modules(morpheus.config) -@pytest.mark.usefixtures("reload_modules", "restore_environ") -def test_constructor_env(): - # Set the NO_CPP flag which should disable C++ regardless - os.environ['MORPHEUS_NO_CPP'] = '1' - - # Reload the CppConfig class just in case - importlib.reload(morpheus.config) - - # Check all messages. 
Should be False regardless due to the environment variable - check_all_messages(False, False) diff --git a/tests/morpheus/messages/test_tensor_memory.py b/tests/morpheus/messages/test_tensor_memory.py index e3f072277c..7e8d3be655 100644 --- a/tests/morpheus/messages/test_tensor_memory.py +++ b/tests/morpheus/messages/test_tensor_memory.py @@ -16,14 +16,13 @@ import os import string +import types import typing -import cupy as cp import numpy as np import pytest from _utils import TEST_DIRS -from morpheus.config import Config from morpheus.messages.memory.inference_memory import InferenceMemory from morpheus.messages.memory.inference_memory import InferenceMemoryAE from morpheus.messages.memory.inference_memory import InferenceMemoryFIL @@ -33,6 +32,7 @@ from morpheus.messages.memory.response_memory import ResponseMemoryProbs from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import NDArrayType INPUT_FILE = os.path.join(TEST_DIRS.tests_data_dir, 'filter_probs.csv') @@ -40,14 +40,14 @@ # pylint: disable=unused-argument -def compare_tensors(tensors1: typing.Dict[str, cp.ndarray], tensors2: typing.Dict[str, cp.ndarray]): +def compare_tensors(tensors1: typing.Dict[str, NDArrayType], tensors2: typing.Dict[str, NDArrayType]): assert sorted(tensors1.keys()) == sorted(tensors2.keys()) for (k, val1) in tensors1.items(): assert (val1 == tensors2[k]).all() -def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, cp.ndarray]): - other_tensors = {'ones': cp.ones(count), 'zeros': cp.zeros(count)} +def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, NDArrayType], array_pkg: types.ModuleType): + other_tensors = {'ones': array_pkg.ones(count), 'zeros': array_pkg.zeros(count)} mem = cls(count=count) assert mem.count == count @@ -73,27 +73,43 @@ def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, cp.ndar cls(count, tensors) -def test_tensor_memory(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +def check_response_memory_probs(cls: type, array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) + count = test_data.shape[0] + + mem = cls(count=count, probs=test_data) + assert mem.count == count + compare_tensors(mem.get_tensors(), {'probs': test_data}) + assert (mem.get_output('probs') == test_data).all() + + with pytest.raises(TypeError): + cls(count, test_data) + + return mem + + +@pytest.mark.gpu_and_cpu_mode +def test_tensor_memory(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] # TensorMemory expects a dictionary of { <tensor_name> : <cupy array> } # Convert each column into a 1d cupy array tensors = {} for col in range(test_data.shape[1]): - tensors[string.ascii_lowercase[col]] = cp.array(test_data[:, col]) + tensors[string.ascii_lowercase[col]] = array_pkg.array(test_data[:, col]) for cls in (TensorMemory, InferenceMemory, ResponseMemory): - check_tensor_memory(cls, count, tensors) + check_tensor_memory(cls=cls, count=count, tensors=tensors, array_pkg=array_pkg) -@pytest.mark.use_python -def test_inference_memory_ae(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_ae(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = 
test_data.shape[0] - input_tensor = cp.array(test_data[:, 0]) - seq_ids = cp.array(test_data[:, 1]) + input_tensor = array_pkg.array(test_data[:, 0]) + seq_ids = array_pkg.array(test_data[:, 1]) mem = InferenceMemoryAE(count=count, inputs=input_tensor, seq_ids=seq_ids) assert mem.count == count @@ -105,12 +121,13 @@ def test_inference_memory_ae(config: Config): InferenceMemoryAE(count, input_tensor, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_fil(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_fil(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - input_0 = cp.array(test_data[:, 0]) - seq_ids = cp.array(test_data[:, 1]) + input_0 = array_pkg.array(test_data[:, 0]) + seq_ids = array_pkg.array(test_data[:, 1]) mem = InferenceMemoryFIL(count=count, input__0=input_0, seq_ids=seq_ids) assert mem.count == count @@ -122,13 +139,14 @@ def test_inference_memory_fil(config: Config): InferenceMemoryFIL(count, input_0, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_nlp(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_nlp(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - input_ids = cp.array(test_data[:, 0]) - input_mask = cp.array(test_data[:, 1]) - seq_ids = cp.array(test_data[:, 2]) + input_ids = array_pkg.array(test_data[:, 0]) + input_mask = array_pkg.array(test_data[:, 1]) + seq_ids = array_pkg.array(test_data[:, 2]) mem = InferenceMemoryNLP(count=count, input_ids=input_ids, input_mask=input_mask, seq_ids=seq_ids) assert mem.count == count @@ -141,24 +159,9 @@ def test_inference_memory_nlp(config: Config): InferenceMemoryNLP(count, input_ids, input_mask, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def check_response_memory_probs_and_ae(cls: type): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) - count = test_data.shape[0] - - mem = cls(count=count, probs=test_data) - assert mem.count == count - compare_tensors(mem.get_tensors(), {'probs': test_data}) - assert (mem.get_output('probs') == test_data).all() - - with pytest.raises(TypeError): - cls(count, test_data) - - return mem - - -@pytest.mark.use_python -def test_response_memory_ae(config: Config, filter_probs_df: DataFrameType): - mem = check_response_memory_probs_and_ae(ResponseMemoryAE) +@pytest.mark.gpu_and_cpu_mode +def test_response_memory_ae(array_pkg: types.ModuleType, filter_probs_df: DataFrameType): + mem = check_response_memory_probs(ResponseMemoryAE, array_pkg) assert mem.user_id == "" assert mem.explain_df is None @@ -170,38 +173,43 @@ def test_response_memory_ae(config: Config, filter_probs_df: DataFrameType): assert (mem.explain_df.values == filter_probs_df.values).all() -def test_response_memory_probs(config: Config): - check_response_memory_probs_and_ae(ResponseMemoryProbs) +@pytest.mark.gpu_and_cpu_mode +def test_response_memory_probs(array_pkg: types.ModuleType): + check_response_memory_probs(ResponseMemoryProbs, array_pkg) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_constructor_length_error(config: Config, tensor_cls: type): +def 
test_constructor_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 - tensors = {"a": cp.zeros(count), "b": cp.ones(count)} + tensors = {"a": array_pkg.zeros(count), "b": array_pkg.ones(count)} with pytest.raises(ValueError): tensor_cls(count=count - 1, tensors=tensors) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensor_length_error(config: Config, tensor_cls: type): +def test_set_tensor_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 mem = tensor_cls(count=count) with pytest.raises(ValueError): - mem.set_tensor('a', cp.zeros(count + 1)) + mem.set_tensor('a', array_pkg.zeros(count + 1)) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensors_length_error(config: Config, tensor_cls: type): +def test_set_tensors_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 - tensors = {"a": cp.zeros(count), "b": cp.ones(count)} + tensors = {"a": array_pkg.zeros(count), "b": array_pkg.ones(count)} mem = tensor_cls(count=count + 1) with pytest.raises(ValueError): mem.set_tensors(tensors) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) @pytest.mark.parametrize( "shape", @@ -209,12 +217,12 @@ def test_set_tensors_length_error(config: Config, tensor_cls: type): (536870912, 1), # bytesize > 2**31 (134217728, 4) # bytesize > 2**31 and element count > 2**31 ]) -def test_tensorindex_bug(config: Config, tensor_cls: type, shape: typing.Tuple[int, int]): +def test_tensorindex_bug(array_pkg: types.ModuleType, tensor_cls: type, shape: typing.Tuple[int, int]): """ Test for issue #1004. We use a 32bit signed integer for shape and strides, but we shouldn't for element counts and byte sizes. 
""" - tensors = {"a": cp.zeros(shape, dtype=np.float32)} + tensors = {"a": array_pkg.zeros(shape, dtype=np.float32)} mem = tensor_cls(count=shape[0], tensors=tensors) tensor_a = mem.get_tensor('a') @@ -222,19 +230,24 @@ def test_tensorindex_bug(config: Config, tensor_cls: type, shape: typing.Tuple[i assert tensor_a.nbytes == shape[0] * shape[1] * 4 -def test_tensor_update(config: Config): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_update(array_pkg: types.ModuleType): tensor_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } tensor_memory = TensorMemory(count=3, tensors=tensor_data) # Update tensors with new data new_tensors = { - "input_ids": cp.array([4, 5, 6]), "input_mask": cp.array([1, 0, 1]), "segment_ids": cp.array([1, 1, 0]) + "input_ids": array_pkg.array([4, 5, 6]), + "input_mask": array_pkg.array([1, 0, 1]), + "segment_ids": array_pkg.array([1, 1, 0]) } tensor_memory.set_tensors(new_tensors) for (key, cp_arr) in new_tensors.items(): tensor = tensor_memory.get_tensor(key) - cp.allclose(tensor, cp_arr) + array_pkg.allclose(tensor, cp_arr) diff --git a/tests/morpheus/modules/test_from_control_message.py b/tests/morpheus/modules/test_from_control_message.py index b129bbbcc8..514dc68234 100644 --- a/tests/morpheus/modules/test_from_control_message.py +++ b/tests/morpheus/modules/test_from_control_message.py @@ -71,7 +71,7 @@ def test_get_module(): fn_constructor("FromControlMessageTest", config) # pylint: disable=not-callable -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("filename, expected_count", [("train_infer.json", 0), ("train.json", 0)], indirect=["filename"]) def test_cm_with_no_payload(config, filename, expected_count): @@ -97,7 +97,7 @@ def test_cm_with_no_payload(config, filename, expected_count): assert len(sink_stage.get_messages()) == expected_count -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("filename, expected_count", [("train_infer.json", 2), ("train.json", 1)], indirect=["filename"]) def test_cm_with_with_payload(config, filename, expected_count): diff --git a/tests/morpheus/modules/test_payload_batcher.py b/tests/morpheus/modules/test_payload_batcher.py index 02acd6b8ee..8fa39b18a5 100644 --- a/tests/morpheus/modules/test_payload_batcher.py +++ b/tests/morpheus/modules/test_payload_batcher.py @@ -83,7 +83,7 @@ def test_get_module(): assert isinstance(module_instance, mrc.core.segment.SegmentModule) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize( "max_batch_size, raise_on_failure, group_by_columns, disable_max_batch_size, timestamp_column_name, " "timestamp_pattern, period, expected_count, expected_exception", @@ -193,7 +193,7 @@ def test_custom_params(config, assert len(sink_stage.get_messages()) == expected_count -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_default_params(config, filter_probs_df): pipe = Pipeline(config) diff --git a/tests/morpheus/modules/test_to_control_message.py b/tests/morpheus/modules/test_to_control_message.py index 96f91a2fee..ce2218b8aa 100644 --- a/tests/morpheus/modules/test_to_control_message.py +++ b/tests/morpheus/modules/test_to_control_message.py @@ -61,7 +61,7 @@ def test_get_module(): assert isinstance(module_instance, mrc.core.segment.SegmentModule) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("expected_count", [1, 2]) def 
test_to_control_message_module(config, filter_probs_df, expected_count): dataframes = [filter_probs_df for _ in range(expected_count)] diff --git a/tests/morpheus/parsers/test_windows_event_parser.py b/tests/morpheus/parsers/test_windows_event_parser.py index c90207612e..f287abfcd2 100644 --- a/tests/morpheus/parsers/test_windows_event_parser.py +++ b/tests/morpheus/parsers/test_windows_event_parser.py @@ -630,6 +630,7 @@ def test_windows_event_parser(): test_logs = fh.readlines() test_input = cudf.Series(test_logs) test_output_df = wep.parse(test_input) + for parsed_rec in test_output_df.to_records(): eventcode = parsed_rec["eventcode"] validate_func = VALIDATE_DICT.get(eventcode, unknown_record_type) diff --git a/tests/morpheus/pipeline/test_error_pipe.py b/tests/morpheus/pipeline/test_error_pipe.py index 7f1e044286..cb264f2231 100755 --- a/tests/morpheus/pipeline/test_error_pipe.py +++ b/tests/morpheus/pipeline/test_error_pipe.py @@ -16,7 +16,6 @@ import logging -import pandas as pd import pytest from _utils.stages.error_raiser import ErrorRaiserStage @@ -26,10 +25,12 @@ from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage +from morpheus.utils.type_aliases import DataFrameType +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("exception_cls", [RuntimeError, ValueError, NotImplementedError]) -def test_stage_raises_exception(config: Config, filter_probs_df: pd.DataFrame, exception_cls: type[Exception]): +def test_stage_raises_exception(config: Config, filter_probs_df: DataFrameType, exception_cls: type[Exception]): pipe = LinearPipeline(config) pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) error_raiser_stage = pipe.add_stage(ErrorRaiserStage(config, exception_cls=exception_cls)) @@ -43,7 +44,7 @@ def test_stage_raises_exception(config: Config, filter_probs_df: pd.DataFrame, e assert len(sink_stage.get_messages()) == 0 -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("delayed_start", [False, True]) def test_monitor_not_impl(config: Config, delayed_start: bool): diff --git a/tests/morpheus/pipeline/test_execution_modes.py b/tests/morpheus/pipeline/test_execution_modes.py new file mode 100755 index 0000000000..d740235a1b --- /dev/null +++ b/tests/morpheus/pipeline/test_execution_modes.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections.abc +import typing + +import mrc +import pytest +from mrc.core import operators as ops + +from _utils.stages.conv_msg import ConvMsg +from morpheus.config import Config +from morpheus.config import ExecutionMode +from morpheus.pipeline.execution_mode_mixins import CpuOnlyMixin +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin +from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stage_decorator import source +from morpheus.pipeline.stage_decorator import stage + + +@source +def gpu_only_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@source(execution_modes=(ExecutionMode.CPU, )) +def cpu_only_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@source(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) +def gpu_cpu_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@stage +def gpu_only_stage(message: typing.Any) -> typing.Any: + return message + + +@stage(execution_modes=(ExecutionMode.CPU, )) +def cpu_only_stage(message: typing.Any) -> typing.Any: + return message + + +@stage(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) +def gpu_cpu_stage(message: typing.Any) -> typing.Any: + return message + + +class BaseStage(PassThruTypeMixin, SinglePortStage): + + def accepted_types(self) -> typing.Tuple: + return (typing.Any, ) + + def supports_cpp_node(self) -> bool: + return False + + def on_data(self, data: typing.Any) -> typing.Any: + return data + + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + node = builder.make_node(self.unique_name, ops.map(self.on_data)) + builder.make_edge(input_node, node) + + return node + + +class CpuOnlyStage(CpuOnlyMixin, BaseStage): + + @property + def name(self) -> str: + return "test-cpu-only-stage" + + +class GpuOnlyStage(BaseStage): + + @property + def name(self) -> str: + return "test-gpu-only-stage" + + +class GpuAndCpuStage(GpuAndCpuMixin, BaseStage): + + @property + def name(self) -> str: + return "test-gpu-and-cpu-stage" + + +@pytest.mark.parametrize("stage_cls, expected_modes", + [ + (GpuOnlyStage, {ExecutionMode.GPU}), + (CpuOnlyStage, {ExecutionMode.CPU}), + (GpuAndCpuStage, {ExecutionMode.GPU, ExecutionMode.CPU}), + (gpu_only_source, {ExecutionMode.GPU}), + (cpu_only_source, {ExecutionMode.CPU}), + (gpu_cpu_source, {ExecutionMode.GPU, ExecutionMode.CPU}), + (gpu_only_stage, {ExecutionMode.GPU}), + (cpu_only_stage, {ExecutionMode.CPU}), + (gpu_cpu_stage, {ExecutionMode.GPU, ExecutionMode.CPU}), + ]) +def test_execution_mode_mixins(stage_cls: type[ConvMsg], expected_modes: set): + # intentionally not using the config fixture so that we can set the execution mode manually + config = Config() + if ExecutionMode.CPU in expected_modes: + config.execution_mode = ExecutionMode.CPU + else: + config.execution_mode = ExecutionMode.GPU + + stage_ = stage_cls(config) + assert set(stage_.supported_execution_modes()) == expected_modes + + +@pytest.mark.parametrize("stage_cls, execution_mode", + [ + (GpuOnlyStage, ExecutionMode.CPU), + (gpu_only_source, ExecutionMode.CPU), + (gpu_only_stage, ExecutionMode.CPU), + (CpuOnlyStage, ExecutionMode.GPU), + (cpu_only_source, ExecutionMode.GPU), + (cpu_only_stage, ExecutionMode.GPU), + ]) +def test_unsupported_mode_error(stage_cls: type[ConvMsg], execution_mode: ExecutionMode): + # intentionally not using the config fixture so 
that we can set the execution mode and avoid iterating over + # python/C++ execution modes + config = Config() + config.execution_mode = execution_mode + + with pytest.raises(RuntimeError, match="Unsupported execution mode"): + stage_ = stage_cls(config) + stage_._pre_build(do_propagate=False) diff --git a/tests/morpheus/pipeline/test_file_in_out.py b/tests/morpheus/pipeline/test_file_in_out.py index a99e649821..b61e496bec 100755 --- a/tests/morpheus/pipeline/test_file_in_out.py +++ b/tests/morpheus/pipeline/test_file_in_out.py @@ -41,6 +41,7 @@ @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_type", ["csv", "jsonlines", "parquet"]) @pytest.mark.parametrize("use_pathlib", [False, True]) @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) @@ -91,6 +92,7 @@ def test_file_rw_pipe(tmp_path: pathlib.Path, assert output_data.tolist() == validation_data.tolist() +@pytest.mark.gpu_and_cpu_mode def test_file_read_json(config: Config): src_file = os.path.join(TEST_DIRS.tests_data_dir, "simple.json") @@ -110,7 +112,7 @@ def test_file_read_json(config: Config): @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.usefixtures("chdir_tmpdir") def test_to_file_no_path(tmp_path: pathlib.Path, config: Config): """ @@ -131,6 +133,7 @@ def test_to_file_no_path(tmp_path: pathlib.Path, config: Config): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_type", ["csv", "jsonlines", "parquet"]) @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) def test_file_rw_multi_segment_pipe(tmp_path: pathlib.Path, config: Config, input_type: str, output_type: str): @@ -165,6 +168,7 @@ def test_file_rw_multi_segment_pipe(tmp_path: pathlib.Path, config: Config, inpu @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file", [ os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), @@ -189,6 +193,7 @@ def test_file_rw_index_pipe(tmp_path: pathlib.Path, config: Config, input_file: assert output_data.tolist() == validation_data.tolist() +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file,extra_kwargs", [(os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), { "include_header": True, "include_index_col": False @@ -196,7 +201,6 @@ def test_file_rw_index_pipe(tmp_path: pathlib.Path, config: Config, input_file: "include_header": True }), (os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.jsonlines"), {})], ids=["CSV", "CSV_ID", "JSON"]) -@pytest.mark.usefixtures("use_cpp") def test_file_roundtrip(tmp_path: pathlib.Path, input_file: str, extra_kwargs: dict[str, typing.Any]): # Output file should be same type as input @@ -235,6 +239,7 @@ def test_read_cpp_compare(input_file: str): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) def test_file_rw_serialize_deserialize_pipe(tmp_path: pathlib.Path, config: Config, output_type: str): input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") diff --git a/tests/morpheus/pipeline/test_pipe_viz.py b/tests/morpheus/pipeline/test_pipe_viz.py index da2b245886..156496ab56 100755 --- a/tests/morpheus/pipeline/test_pipe_viz.py +++ b/tests/morpheus/pipeline/test_pipe_viz.py @@ -25,6 +25,7 @@ from _utils.dataset_manager import DatasetManager from _utils.stages.conv_msg import ConvMsg from morpheus.cli.commands import RANKDIR_CHOICES +from morpheus.config import Config from morpheus.pipeline import LinearPipeline from 
morpheus.pipeline.pipeline import Pipeline from morpheus.pipeline.pipeline import PipelineState @@ -35,10 +36,8 @@ from morpheus.stages.preprocess.deserialize_stage import DeserializeStage -# pylint: disable=redefined-outer-name -@pytest.mark.use_cudf @pytest.fixture(name="viz_pipeline", scope="function") -def viz_pipeline_fixture(config, filter_probs_df): +def viz_pipeline_fixture(config: Config, dataset_cudf: DatasetManager): """ Creates a quick pipeline. """ @@ -46,9 +45,9 @@ def viz_pipeline_fixture(config, filter_probs_df): config.num_threads = 1 pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) + pipe.set_source(InMemorySourceStage(config, [dataset_cudf["filter_probs.csv"]])) pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(ConvMsg(config, filter_probs_df)) + pipe.add_stage(ConvMsg(config, dataset_cudf["filter_probs.csv"])) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) pipe.add_stage(InMemorySinkStage(config)) diff --git a/tests/morpheus/pipeline/test_pipeline.py b/tests/morpheus/pipeline/test_pipeline.py index aded507af6..56aa234ffd 100755 --- a/tests/morpheus/pipeline/test_pipeline.py +++ b/tests/morpheus/pipeline/test_pipeline.py @@ -38,7 +38,7 @@ from morpheus.utils.type_aliases import DataFrameType -class SourceTestStage(InMemorySourceStage): +class SourceTestStage(InMemorySourceStage): # pylint: disable=too-many-ancestors def __init__(self, config, diff --git a/tests/morpheus/pipeline/test_preallocation_pipe.py b/tests/morpheus/pipeline/test_preallocation_pipe.py index 53f85a46fc..f82eb97fe0 100755 --- a/tests/morpheus/pipeline/test_preallocation_pipe.py +++ b/tests/morpheus/pipeline/test_preallocation_pipe.py @@ -23,6 +23,7 @@ from _utils.stages.conv_msg import ConvMsg from morpheus.common import TypeId from morpheus.common import typeid_to_numpy_str +from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline @@ -32,10 +33,12 @@ from morpheus.stages.postprocess.add_scores_stage import AddScoresStage from morpheus.stages.postprocess.serialize_stage import SerializeStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage +from morpheus.utils.type_aliases import DataFrameType +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize('probs_type', [TypeId.FLOAT32, TypeId.FLOAT64]) -def test_preallocation(config, filter_probs_df, probs_type): +def test_preallocation(config: Config, filter_probs_df: DataFrameType, probs_type: TypeId): config.class_labels = ['frogs', 'lizards', 'toads', 'turtles'] probs_np_type = typeid_to_numpy_str(probs_type) expected_df = pd.DataFrame( @@ -61,8 +64,9 @@ def test_preallocation(config, filter_probs_df, probs_type): assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize('probs_type', [TypeId.FLOAT32, TypeId.FLOAT64]) -def test_preallocation_multi_segment_pipe(config, filter_probs_df, probs_type): +def test_preallocation_multi_segment_pipe(config: Config, filter_probs_df: DataFrameType, probs_type: TypeId): """ Test ensures that when columns are needed for preallocation in a multi-segment pipeline, the preallocation will always be performed on the closest source to the stage that requested preallocation. 
Which in cases where the @@ -99,7 +103,7 @@ def test_preallocation_multi_segment_pipe(config, filter_probs_df, probs_type): assert_results(comp_stage.get_results()) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_preallocation_error(config, filter_probs_df): """ Verify that we get a raised exception when add_scores attempts to use columns that don't exist diff --git a/tests/morpheus/pipeline/test_stage_decorator.py b/tests/morpheus/pipeline/test_stage_decorator.py index 25b3095209..31a45d553d 100644 --- a/tests/morpheus/pipeline/test_stage_decorator.py +++ b/tests/morpheus/pipeline/test_stage_decorator.py @@ -30,6 +30,7 @@ from _utils import assert_results from morpheus.common import TypeId from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline from morpheus.pipeline.stage_decorator import ComputeSchemaType @@ -41,6 +42,7 @@ from morpheus.pipeline.stage_decorator import stage from morpheus.pipeline.stage_schema import StageSchema from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage +from morpheus.utils.type_aliases import DataFrameType def _get_annotation(type_: type, generator_type: type) -> type: @@ -59,7 +61,6 @@ def _mk_compute_schema_fn(return_type: type) -> ComputeSchemaType: return lambda schema: schema.output_schema.set_type(return_type) -@pytest.mark.use_python @pytest.mark.parametrize("generator_type", [None, typing.Iterator, typing.Generator, collections.abc.Iterator, collections.abc.Generator]) @pytest.mark.parametrize("return_type, is_prealloc", [(pd.DataFrame, True), (cudf.DataFrame, True), (MessageMeta, True), @@ -96,7 +97,6 @@ def test_source_gen() -> return_annotation: mock_compute_schema_fn.assert_called_once_with(schema) -@pytest.mark.use_python @pytest.mark.parametrize("src_cls", [WrappedFunctionSourceStage, PreAllocatedWrappedFunctionStage]) def test_wrapped_function_source_stage_not_generator_error(config: Config, src_cls: type): @@ -110,7 +110,6 @@ def test_source_gen() -> MessageMeta: compute_schema_fn=_mk_compute_schema_fn(MessageMeta)) -@pytest.mark.use_python @pytest.mark.parametrize("generator_type", [None, typing.Iterator, typing.Generator, collections.abc.Iterator, collections.abc.Generator]) @pytest.mark.parametrize("return_type, is_prealloc", [(pd.DataFrame, True), (cudf.DataFrame, True), (MessageMeta, True), @@ -133,7 +132,6 @@ def test_source_gen() -> return_annotation: assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_source_decorator_name(config: Config): @source @@ -144,7 +142,6 @@ def test_source_gen(value: int) -> int: assert source_stage.name == 'test_source_gen' # pylint: disable=no-member -@pytest.mark.use_python def test_source_decorator_explicit_name(config: Config): @source(name="source_gen") @@ -155,7 +152,6 @@ def test_source_gen(value: int) -> int: assert source_stage.name == 'source_gen' # pylint: disable=no-member -@pytest.mark.use_python def test_source_decorator_explicit_compute_schema(config: Config): mock_compute_schema_fn = mock.MagicMock() mock_compute_schema_fn.side_effect = _mk_compute_schema_fn(int) @@ -171,7 +167,6 @@ def test_source_gen(value: int) -> int: mock_compute_schema_fn.assert_called_once_with(schema) -@pytest.mark.use_python def test_source_decorator_no_annoation_error(config: Config): @source @@ -182,7 +177,6 @@ def test_source_gen(): test_source_gen(config) # pylint: disable=too-many-function-args -@pytest.mark.use_python def 
test_not_generator_error(config: Config): @source @@ -193,7 +187,6 @@ def test_fn() -> int: test_fn(config) # pylint: disable=too-many-function-args -@pytest.mark.use_python def test_source_stage_arg_no_value_error(config: Config): @source @@ -204,7 +197,6 @@ def test_source_gen(value: int) -> int: test_source_gen(config) -@pytest.mark.use_python @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), (typing.Union[float, int], float), (float, typing.Any), (typing.Any, float), @@ -220,7 +212,6 @@ def test_wrapped_function_stage_constructor(config: Config, accept_type: type, r assert wrapped_stage.accepted_types() == (accept_type, ) -@pytest.mark.use_python @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), (typing.Union[float, int], float), (float, float), (typing.Any, float), @@ -256,7 +247,6 @@ def source_fn(): assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_wrapped_function_stage_name(config: Config): def multiplier(message: MessageMeta, column: str, value: int | float) -> MessageMeta: @@ -273,7 +263,6 @@ def multiplier(message: MessageMeta, column: str, value: int | float) -> Message assert wrapped_stage.name == 'multiplier' -@pytest.mark.use_python @pytest.mark.parametrize("needed_columns", [None, { 'result': TypeId.INT64 @@ -295,7 +284,6 @@ def test_fn(message: MessageMeta) -> MessageMeta: assert wrapped_stage._needed_columns == expected_needed_columns -@pytest.mark.use_python @pytest.mark.parametrize("use_accept_type_annotation", [True, False]) @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), @@ -320,7 +308,6 @@ def test_fn(message) -> return_type: assert wrapped_stage.accepted_types() == (accept_type, ) -@pytest.mark.use_python @pytest.mark.parametrize("name", [None, "unittest-stage"]) def test_stage_decorator_name(config: Config, name: str): if name is None: @@ -336,7 +323,6 @@ def test_fn(message: float, value: float) -> float: assert wrapped_stage.name == expected_name -@pytest.mark.use_python @pytest.mark.parametrize("explicit_compute_schema_fn", [True, False]) @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), @@ -377,7 +363,6 @@ def test_stage(message: accept_type) -> return_type: assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_stage_decorator_no_annotation_error(config: Config): @stage @@ -388,7 +373,6 @@ def test_fn(message): test_fn(config) -@pytest.mark.use_python def test_stage_arg_no_value_error(config: Config): @stage @@ -399,7 +383,6 @@ def test_fn(message: float, value: float) -> float: test_fn(config) # pylint: disable=no-value-for-parameter -@pytest.mark.use_python @pytest.mark.parametrize("needed_columns", [None, { 'result': TypeId.INT64 @@ -417,15 +400,16 @@ def test_fn(message: MessageMeta) -> MessageMeta: assert wrapped_stage._needed_columns == expected_needed_columns -def test_end_to_end_pipe(config: Config, filter_probs_df: cudf.DataFrame): +@pytest.mark.gpu_and_cpu_mode +def test_end_to_end_pipe(config: Config, filter_probs_df: DataFrameType): - @source - def source_gen(dataframes: list[cudf.DataFrame]) -> collections.abc.Iterator[MessageMeta]: + @source(execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) + def 
source_gen(*, dataframes: list[DataFrameType]) -> collections.abc.Iterator[MessageMeta]: for df in dataframes: yield MessageMeta(df) - @stage - def multiplier(message: MessageMeta, column: str, value: int | float = 2.0) -> MessageMeta: + @stage(execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) + def multiplier(message: MessageMeta, *, column: str, value: int | float = 2.0) -> MessageMeta: with message.mutable_dataframe() as df: df[column] = df[column] * value @@ -436,7 +420,7 @@ def multiplier(message: MessageMeta, column: str, value: int | float = 2.0) -> M expected_df['v2'] = expected_df['v2'] * multipy_by * 2.0 pipe = LinearPipeline(config) - pipe.set_source(source_gen(config, dataframes=[filter_probs_df])) # pylint: disable=redundant-keyword-arg + pipe.set_source(source_gen(config, dataframes=[filter_probs_df])) # pylint: disable=too-many-function-args pipe.add_stage(multiplier(config, column='v2', value=multipy_by)) pipe.add_stage(multiplier(config, column='v2')) sink = pipe.add_stage(CompareDataFrameStage(config, expected_df)) diff --git a/tests/morpheus/stages/test_add_classifications_stage.py b/tests/morpheus/stages/test_add_classifications_stage.py index 2966888238..98eff9e698 100755 --- a/tests/morpheus/stages/test_add_classifications_stage.py +++ b/tests/morpheus/stages/test_add_classifications_stage.py @@ -16,23 +16,21 @@ import typing -import cupy as cp +import numpy as np +import pandas as pd import pytest import typing_utils -import cudf - from _utils.dataset_manager import DatasetManager -# pylint: disable=morpheus-incorrect-lib-from-import -from morpheus._lib.messages import TensorMemory as CppTensorMemory from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage @pytest.fixture(name="config") -def config_fixture(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config: Config): config.class_labels = ['frogs', 'lizards', 'toads'] yield config @@ -60,20 +58,20 @@ def test_constructor_errors(config: Config): AddClassificationsStage(config, labels=['missing']) -@pytest.mark.use_python -def test_add_labels(): +@pytest.mark.cpu_mode +def test_add_labels_with_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} threshold = 0.6 - df = cudf.DataFrame([0, 1], columns=["dummy"]) - probs_array = cp.array([[0.1, 0.6, 0.8], [0.3, 0.61, 0.9]]) + df = pd.DataFrame([0, 1], columns=["dummy"]) + probs_array = np.array([[0.1, 0.6, 0.8], [0.3, 0.61, 0.9]]) probs_array_bool = probs_array > threshold cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(CppTensorMemory(count=2, tensors={"probs": probs_array})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array})) labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) @@ -84,7 +82,7 @@ def test_add_labels(): # Too small of a probs array cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(CppTensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) diff --git a/tests/morpheus/stages/test_add_scores_stage.py b/tests/morpheus/stages/test_add_scores_stage.py index f00338c25a..8694632abe 100755 --- 
a/tests/morpheus/stages/test_add_scores_stage.py +++ b/tests/morpheus/stages/test_add_scores_stage.py @@ -16,23 +16,22 @@ import typing -import cupy as cp +import numpy as np +import pandas as pd import pytest import typing_utils -import cudf - -import morpheus._lib.messages as _messages from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage from morpheus.stages.postprocess.add_scores_stage import AddScoresStage @pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.class_labels = ['frogs', 'lizards', 'toads'] config.feature_length = 12 yield config @@ -61,16 +60,16 @@ def test_constructor_errors(config: Config): AddScoresStage(config, labels=['missing']) -@pytest.mark.use_python -def test_add_labels(): +@pytest.mark.cpu_mode +def test_add_labels_with_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} - df = cudf.DataFrame([0, 1], columns=["dummy"]) - probs_array = cp.array([[0.1, 0.5, 0.8], [0.2, 0.6, 0.9]]) + df = pd.DataFrame([0, 1], columns=["dummy"]) + probs_array = np.array([[0.1, 0.5, 0.8], [0.2, 0.6, 0.9]]) cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array})) labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) @@ -81,7 +80,7 @@ def test_add_labels(): # Too small of a probs array cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) diff --git a/tests/morpheus/stages/test_appshield_source_stage.py b/tests/morpheus/stages/test_appshield_source_stage.py index f69983b2ea..03920bef8b 100755 --- a/tests/morpheus/stages/test_appshield_source_stage.py +++ b/tests/morpheus/stages/test_appshield_source_stage.py @@ -23,7 +23,8 @@ from pandas.testing import assert_frame_equal from _utils import TEST_DIRS -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.stages.input.appshield_source_stage import AppShieldSourceStage from morpheus.utils.directory_watcher import DirectoryWatcher @@ -279,7 +280,7 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, @pytest.mark.parametrize( 'input_df_per_source', [{ - 'appshield': [ + 'appshield': pd.DataFrame({ 'PID': pd.Series(['304', '304', '444', '350', '360', '563'], index=[0, 1, 3, 0, 1, 3]), @@ -290,8 +291,7 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, pd.Series(['appshield', 'appshield', 'appshield', 'appshield', 'appshield', 'appshield'], index=[0, 1, 3, 0, 1, 3]) }), - ], - 'appshield-v2': [ + 'appshield-v2': pd.DataFrame({ 'PID': pd.Series(['304', '304', '444', '350', '360', '563'], index=[0, 1, 3, 0, 1, 3]), @@ -303,11 +303,21 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, 'appshield-v2', 'appshield-v2', 
'appshield-v2', 'appshield-v2', 'appshield-v2', 'appshield-v2' ], index=[0, 1, 3, 0, 1, 3]) - }), - ] + }) }]) -def test_build_metadata(input_df_per_source): - appshield_message_metas = AppShieldSourceStage._build_metadata(input_df_per_source) +def test_build_messages(config: Config, tmp_path: str, input_df_per_source: dict): + expected_sources = sorted(input_df_per_source.keys()) + + input_glob = os.path.join(tmp_path, '*.json') + # These constructor arguments are not used by the _build_messages method + stage = AppShieldSourceStage(config, input_glob, ['unused'], ['unused']) + appshield_messages = stage._build_messages(input_df_per_source) + + assert len(appshield_messages) == len(expected_sources) + + actual_sources = [] + for message in appshield_messages: + assert isinstance(message, ControlMessage) + actual_sources.append(message.get_metadata('source')) - assert len(appshield_message_metas) == 2 - assert isinstance(appshield_message_metas[0], AppShieldMessageMeta) + assert sorted(actual_sources) == expected_sources diff --git a/tests/morpheus/stages/test_deserialize_stage_pipe.py b/tests/morpheus/stages/test_deserialize_stage_pipe.py index e9d2f9e317..a8f656fa65 100755 --- a/tests/morpheus/stages/test_deserialize_stage_pipe.py +++ b/tests/morpheus/stages/test_deserialize_stage_pipe.py @@ -29,7 +29,7 @@ @pytest.mark.use_cudf -@pytest.mark.usefixtures("use_cpp") +@pytest.mark.gpu_mode def test_fixing_non_unique_indexes(dataset: DatasetManager): # Set 2 ids equal to others df = dataset.dup_index(dataset["filter_probs.csv"], count=2) diff --git a/tests/morpheus/stages/test_file_source_stage.py b/tests/morpheus/stages/test_file_source_stage.py new file mode 100755 index 0000000000..19d2dacd51 --- /dev/null +++ b/tests/morpheus/stages/test_file_source_stage.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from _utils import TEST_DIRS +from morpheus.config import Config +from morpheus.config import ExecutionMode +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin +from morpheus.stages.input.file_source_stage import FileSourceStage + + +def test_execution_modes(config: Config): + assert issubclass(FileSourceStage, GpuAndCpuMixin) + stage = FileSourceStage(config, filename=os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv")) + + # we don't care about the order of the execution modes + assert set(stage.supported_execution_modes()) == {ExecutionMode.GPU, ExecutionMode.CPU} diff --git a/tests/morpheus/stages/test_file_source_stage_pipe.py b/tests/morpheus/stages/test_file_source_stage_pipe.py index 59f9c76d63..0f5c1fdb2e 100755 --- a/tests/morpheus/stages/test_file_source_stage_pipe.py +++ b/tests/morpheus/stages/test_file_source_stage_pipe.py @@ -25,6 +25,7 @@ from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.io.deserializers import read_file_to_df from morpheus.pipeline import LinearPipeline from morpheus.stages.input.file_source_stage import FileSourceStage @@ -32,6 +33,7 @@ @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file", [ os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), @@ -46,7 +48,7 @@ def test_file_source_stage_pipe(config: Config, input_file: str, filter_null: bo parser_kwargs = {} if determine_file_type(input_file) == FileTypes.JSON: # kwarg specific to pandas.read_json - parser_kwargs['convert_dates'] = False + parser_kwargs['convert_dates'] = config.execution_mode == ExecutionMode.CPU expected_df = read_file_to_df(file_name=input_file, filter_nulls=filter_null, diff --git a/tests/morpheus/stages/test_filter_detections_stage.py b/tests/morpheus/stages/test_filter_detections_stage.py index cd7f361007..d0bd75d622 100644 --- a/tests/morpheus/stages/test_filter_detections_stage.py +++ b/tests/morpheus/stages/test_filter_detections_stage.py @@ -16,13 +16,13 @@ import typing -import cupy as cp +import numpy as np import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.common import FilterSource from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage @@ -31,7 +31,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm @@ -45,11 +45,11 @@ def test_constructor(config): assert typing_utils.issubtype(ControlMessage, accepted_union) -@pytest.mark.use_cudf +@pytest.mark.use_pandas def test_filter_copy(config, filter_probs_df): fds = FilterDetectionsStage(config, threshold=0.5, filter_source=FilterSource.TENSOR) - probs = cp.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) + probs = np.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) mock_control_message = _make_control_message(filter_probs_df, probs) # All values are at or below the threshold so nothing should be returned @@ -57,7 +57,7 @@ def test_filter_copy(config, filter_probs_df): assert output_control_message is None # Only one row has a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], 
[0.1, 0.5, 0.8], [0.2, 0.4, 0.3], @@ -65,11 +65,11 @@ def test_filter_copy(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 1:1, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 1:1, :].to_numpy().tolist() # Two adjacent rows have a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -79,11 +79,11 @@ def test_filter_copy(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 2:3, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 2:3, :].to_numpy().tolist() # Two non-adjacent rows have a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -92,17 +92,17 @@ [0.2, 0.4, 0.3], ]) - mask = cp.zeros(len(filter_probs_df), dtype=cp.bool_) + mask = np.zeros(len(filter_probs_df), dtype=np.bool_) mask[2] = True mask[4] = True mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - mask, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + mask, :].to_numpy().tolist() -@pytest.mark.use_cudf +@pytest.mark.use_pandas @pytest.mark.parametrize('do_copy', [True, False]) @pytest.mark.parametrize('threshold', [0.1, 0.5, 0.8]) @pytest.mark.parametrize('field_name', ['v1', 'v2', 'v3', 'v4']) @@ -112,22 +112,19 @@ def test_filter_column(config, filter_probs_df, do_copy, threshold, field_name): copy=do_copy, filter_source=FilterSource.DATAFRAME, field_name=field_name) - expected_df = filter_probs_df.to_pandas() - expected_df = expected_df[expected_df[field_name] > threshold] + expected_df = filter_probs_df[filter_probs_df[field_name] > threshold] - probs = cp.zeros([len(filter_probs_df), 3], 'float') - - # All values are at or below the threshold + probs = np.zeros([len(filter_probs_df), 3], 'float') mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == expected_df.to_numpy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == expected_df.to_numpy().tolist() -@pytest.mark.use_cudf +@pytest.mark.use_pandas def test_filter_slice(config, filter_probs_df): fds = FilterDetectionsStage(config, threshold=0.5, filter_source=FilterSource.TENSOR) - probs = cp.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) + probs = np.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) # All values are at or below the threshold @@ -136,7 +133,7 @@ def test_filter_slice(config, filter_probs_df): assert len(output_control_message) == 0 # Only one row has a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.5, 0.8], [0.2, 
0.4, 0.3, @@ -144,12 +141,11 @@ mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_slice(mock_control_message) - assert len(output_control_message) == 1 - assert output_control_message[0].payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 1:1, :].to_cupy().tolist() + assert output_control_message[0].payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 1:1, :].to_numpy().tolist() # Two adjacent rows have a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -159,12 +155,11 @@ def test_filter_slice(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_slice(mock_control_message) - assert len(output_control_message) == 1 - assert output_control_message[0].payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 2:3, :].to_cupy().tolist() + assert output_control_message[0].payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 2:3, :].to_numpy().tolist() # Two non-adjacent rows have a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -180,5 +175,5 @@ def test_filter_slice(config, filter_probs_df): assert control_msg1.payload().count == 1 assert control_msg2.payload().count == 1 - assert control_msg1.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[2:2, :].to_cupy().tolist() - assert control_msg2.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[4:4, :].to_cupy().tolist() + assert control_msg1.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[2:2, :].to_numpy().tolist() + assert control_msg2.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[4:4, :].to_numpy().tolist() diff --git a/tests/morpheus/stages/test_filter_detections_stage_pipe.py b/tests/morpheus/stages/test_filter_detections_stage_pipe.py index f2d6e7dcdb..72a6e1fb7c 100755 --- a/tests/morpheus/stages/test_filter_detections_stage_pipe.py +++ b/tests/morpheus/stages/test_filter_detections_stage_pipe.py @@ -71,13 +71,13 @@ def _test_filter_detections_stage_pipe(config: Config, def _test_filter_detections_control_message_stage_multi_segment_pipe(config: Config, - dataset_pandas: DatasetManager, + dataset: DatasetManager, copy: bool = True): threshold = 0.75 - input_df = dataset_pandas["filter_probs.csv"] + input_df = dataset["filter_probs.csv"] pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + pipe.set_source(InMemorySourceStage(config, [input_df])) pipe.add_segment_boundary(MessageMeta) pipe.add_stage(DeserializeStage(config)) pipe.add_segment_boundary(data_type=ControlMessage) @@ -87,8 +87,7 @@ def _test_filter_detections_control_message_stage_multi_segment_pipe(config: Con pipe.add_segment_boundary(ControlMessage) pipe.add_stage(SerializeStage(config)) pipe.add_segment_boundary(MessageMeta) - comp_stage = pipe.add_stage( - CompareDataFrameStage(config, build_expected(dataset_pandas["filter_probs.csv"], threshold))) + comp_stage = pipe.add_stage(CompareDataFrameStage(config, build_expected(dataset["filter_probs.csv"], threshold))) pipe.run() assert_results(comp_stage.get_results()) @@ -108,6 +107,7 @@ def test_filter_detections_stage_pipe(config: Config, return _test_filter_detections_stage_pipe(config, dataset_pandas, do_copy, order, 
pipeline_batch_size, repeat) +@pytest.mark.slow @pytest.mark.parametrize('do_copy', [True, False]) def test_filter_detections_control_message_stage_multi_segment_pipe(config: Config, dataset_pandas: DatasetManager, diff --git a/tests/morpheus/stages/test_generate_viz_frames_stage.py b/tests/morpheus/stages/test_generate_viz_frames_stage.py index 879220d204..77da125263 100644 --- a/tests/morpheus/stages/test_generate_viz_frames_stage.py +++ b/tests/morpheus/stages/test_generate_viz_frames_stage.py @@ -21,10 +21,10 @@ import cudf -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.postprocess.generate_viz_frames_stage import GenerateVizFramesStage @@ -32,7 +32,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm diff --git a/tests/morpheus/stages/test_http_server_sink_stage.py b/tests/morpheus/stages/test_http_server_sink_stage.py index 9702ec1dd5..1f9359dbf2 100644 --- a/tests/morpheus/stages/test_http_server_sink_stage.py +++ b/tests/morpheus/stages/test_http_server_sink_stage.py @@ -89,7 +89,7 @@ def _custom_serializer(df: DataFrameType) -> str: @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("lines", [False, True]) @pytest.mark.parametrize("max_rows_per_response", [10000, 10]) @pytest.mark.parametrize("df_serializer_fn", [None, _custom_serializer]) diff --git a/tests/morpheus/stages/test_http_server_source_stage.py b/tests/morpheus/stages/test_http_server_source_stage.py index 268d6bbc29..b98c931eab 100644 --- a/tests/morpheus/stages/test_http_server_source_stage.py +++ b/tests/morpheus/stages/test_http_server_source_stage.py @@ -58,7 +58,7 @@ def join(self, timeout=None): @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.parametrize("lines", [False, True], ids=["json", "lines"]) @pytest.mark.parametrize("use_payload_to_df_fn", [False, True], ids=["no_payload_to_df_fn", "payload_to_df_fn"]) def test_generate_frames(config: Config, @@ -99,6 +99,9 @@ def test_generate_frames(config: Config, lines=lines, payload_to_df_fn=payload_to_df_fn) + if not use_payload_to_df_fn: + stage._set_default_payload_to_df_fn() + generate_frames = stage._generate_frames(mock_subscription) msg_queue = queue.SimpleQueue() @@ -155,7 +158,7 @@ def test_constructor_invalid_accept_status(config: Config, invalid_accept_status @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.parametrize( "lines", [False, pytest.param(True, marks=pytest.mark.skip(reason="https://github.com/rapidsai/cudf/issues/15820"))], diff --git a/tests/morpheus/stages/test_inference_stage.py b/tests/morpheus/stages/test_inference_stage.py index 262f84b3e3..10370210a1 100755 --- a/tests/morpheus/stages/test_inference_stage.py +++ b/tests/morpheus/stages/test_inference_stage.py @@ -22,9 +22,10 @@ import cudf -import morpheus._lib.messages as _messages from _utils.inference_worker import IW from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemory +from morpheus.messages import ResponseMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.inference.inference_stage import InferenceStage @@ 
-45,12 +46,11 @@ def _mk_control_message(mess_count=1, count=1): msg = ControlMessage() msg.payload(MessageMeta(df)) msg.tensors( - _messages.InferenceMemory( - count=total_tensor_count, - tensors={ - "probs": cp.random.rand(total_tensor_count, 2), - "seq_ids": cp.tile(cp.expand_dims(cp.arange(0, total_tensor_count), axis=1), (1, 3)) - })) + InferenceMemory(count=total_tensor_count, + tensors={ + "probs": cp.random.rand(total_tensor_count, 2), + "seq_ids": cp.tile(cp.expand_dims(cp.arange(0, total_tensor_count), axis=1), (1, 3)) + })) return msg @@ -95,14 +95,14 @@ def test_join(config): worker.join.assert_awaited_once() -@pytest.mark.use_python +@pytest.mark.gpu_mode def test_convert_one_response(): # Test ControlMessage # Test first branch where `inf.mess_count == inf.count` - mem = _messages.ResponseMemory(count=4, tensors={"probs": cp.zeros((4, 3))}) + mem = ResponseMemory(count=4, tensors={"probs": cp.zeros((4, 3))}) inf = _mk_control_message(mess_count=4, count=4) - res = _messages.ResponseMemory(count=4, tensors={"probs": cp.random.rand(4, 3)}) + res = ResponseMemory(count=4, tensors={"probs": cp.random.rand(4, 3)}) output = _mk_control_message(mess_count=4, count=4) output.tensors(mem) @@ -115,10 +115,9 @@ def test_convert_one_response(): # Test for the second branch inf = _mk_control_message(mess_count=2, count=3) inf.tensors().set_tensor("seq_ids", cp.array([[0], [1], [1]])) - res = _messages.ResponseMemory(count=3, - tensors={"probs": cp.array([[0, 0.6, 0.7], [5.6, 4.4, 9.2], [4.5, 6.7, 8.9]])}) + res = ResponseMemory(count=3, tensors={"probs": cp.array([[0, 0.6, 0.7], [5.6, 4.4, 9.2], [4.5, 6.7, 8.9]])}) - mem = _messages.ResponseMemory(count=2, tensors={"probs": cp.zeros((2, 3))}) + mem = ResponseMemory(count=2, tensors={"probs": cp.zeros((2, 3))}) output = _mk_control_message(mess_count=2, count=3) output.tensors(mem) cm = InferenceStageT._convert_one_response(output, inf, res) diff --git a/tests/morpheus/stages/test_kafka_source_stage_pipe.py b/tests/morpheus/stages/test_kafka_source_stage_pipe.py index cb5adda659..92d93a6c6a 100644 --- a/tests/morpheus/stages/test_kafka_source_stage_pipe.py +++ b/tests/morpheus/stages/test_kafka_source_stage_pipe.py @@ -39,6 +39,7 @@ from kafka import KafkaConsumer +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka def test_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics) -> None: input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.jsonlines") @@ -63,6 +64,7 @@ def test_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str, k assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka def test_multi_topic_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str) -> None: input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.jsonlines") @@ -95,6 +97,7 @@ def test_multi_topic_kafka_source_stage_pipe(config: Config, kafka_bootstrap_ser assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka @pytest.mark.parametrize('async_commits', [True, False]) @pytest.mark.parametrize('num_records', [10, 100, 1000]) @@ -150,6 +153,7 @@ def test_kafka_source_commit(num_records: int, assert actual_offset == expected_offset +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka @pytest.mark.parametrize('num_records', [1000]) def test_kafka_source_batch_pipe(config: Config, diff --git a/tests/morpheus/stages/test_linear_modules_stage.py b/tests/morpheus/stages/test_linear_modules_stage.py index 
8209b96b6e..b0c16de59d 100755 --- a/tests/morpheus/stages/test_linear_modules_stage.py +++ b/tests/morpheus/stages/test_linear_modules_stage.py @@ -23,13 +23,14 @@ from morpheus.stages.general.linear_modules_stage import LinearModulesStage from morpheus.utils.module_utils import mrc_version -module_config = { - "module_id": "TestSimpleModule", "module_name": "test_simple_module", "namespace": "test_morpheus_modules" -} +@pytest.fixture(name="module_config") +def module_config_fixture(): + return {"module_id": "TestSimpleModule", "module_name": "test_simple_module", "namespace": "test_morpheus_modules"} -@pytest.mark.use_python -def test_constructor(config): + +@pytest.mark.gpu_and_cpu_mode +def test_constructor(config, module_config: dict): mod_stage = LinearModulesStage(config, module_config, input_port_name="test_in", output_port_name="test_out") @@ -44,8 +45,8 @@ def test_constructor(config): pytest.raises(NotImplementedError, mod_stage._get_cpp_module_node, None) -@pytest.mark.use_python -def test_build_single_before_module_registration(config): +@pytest.mark.gpu_and_cpu_mode +def test_build_single_before_module_registration(config, module_config: dict): mock_node = mock.MagicMock() mock_segment = mock.MagicMock() @@ -61,19 +62,20 @@ def test_build_single_before_module_registration(config): mod_stage._build_single(mock_segment, mock_input_stream) -def register_test_module(): +def register_test_module(id_postfix: str): registry = mrc.ModuleRegistry def module_init_fn(_: mrc.Builder): pass - registry.register_module("TestSimpleModule", "test_morpheus_modules", mrc_version, module_init_fn) + registry.register_module(f"TestSimpleModule_{id_postfix}", "test_morpheus_modules", mrc_version, module_init_fn) -@pytest.mark.use_python -def test_build_single_after_module_registration(config): +@pytest.mark.gpu_and_cpu_mode +def test_build_single_after_module_registration(config, module_config: dict): - register_test_module() + register_test_module(config.execution_mode.value) + module_config["module_id"] = f"{module_config['module_id']}_{config.execution_mode.value}" mock_node = mock.MagicMock() mock_segment = mock.MagicMock() diff --git a/tests/morpheus/stages/test_ml_flow_drift_stage.py b/tests/morpheus/stages/test_ml_flow_drift_stage.py index f5eaca0229..ff0f4d2a92 100644 --- a/tests/morpheus/stages/test_ml_flow_drift_stage.py +++ b/tests/morpheus/stages/test_ml_flow_drift_stage.py @@ -21,9 +21,9 @@ import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import MessageMeta +from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.postprocess.ml_flow_drift_stage import MLFlowDriftStage @@ -31,7 +31,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm @@ -46,7 +46,6 @@ def test_constructor(config): @pytest.mark.use_cudf -@pytest.mark.use_python def test_calc_drift(config, filter_probs_df): with patch("morpheus.stages.postprocess.ml_flow_drift_stage.mlflow.start_run"): labels = ["a", "b", "c"] diff --git a/tests/morpheus/stages/test_monitor_stage.py b/tests/morpheus/stages/test_monitor_stage.py index b6ff56c6b4..e50153e7e5 100755 --- a/tests/morpheus/stages/test_monitor_stage.py +++ b/tests/morpheus/stages/test_monitor_stage.py 
@@ -179,7 +179,7 @@ def test_log_level(mock_progress_sink: mock.MagicMock, assert mock_sink_on_completed.call_count == expected_call_count -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode def test_thread(config: Config, morpheus_log_level: int): """ Test ensures the monitor stage executes on the same thread as the parent stage diff --git a/tests/morpheus/stages/test_multi_port_modules_stage.py b/tests/morpheus/stages/test_multi_port_modules_stage.py index 31ac032546..ca4e3f35ec 100755 --- a/tests/morpheus/stages/test_multi_port_modules_stage.py +++ b/tests/morpheus/stages/test_multi_port_modules_stage.py @@ -51,7 +51,7 @@ def registered_module_conf(): yield registered_module_conf -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode def test_constructor(config, unregistered_module_conf): mod_stage = MultiPortModulesStage(config, diff --git a/tests/morpheus/stages/test_multi_processing_stage.py b/tests/morpheus/stages/test_multi_processing_stage.py index 470c27c1f8..f88ec3d7d8 100644 --- a/tests/morpheus/stages/test_multi_processing_stage.py +++ b/tests/morpheus/stages/test_multi_processing_stage.py @@ -26,6 +26,7 @@ from _utils import assert_results from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline @@ -54,6 +55,7 @@ def _process_df(df: pd.DataFrame, column: str, value: str) -> pd.DataFrame: return df +@pytest.mark.gpu_and_cpu_mode def test_create_stage_type_deduction(config: Config, dataset_pandas: DatasetManager): # Test create() with normal function @@ -110,26 +112,39 @@ def __init__(self, self._add_column_name = add_column_name self._shared_process_pool.set_usage(self.name, self._process_pool_usage) + self._execution_mode = c.execution_mode @property def name(self) -> str: return "derived-multi-processing-stage" + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. 
+ """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + def _on_data(self, data: ControlMessage) -> ControlMessage: input_df = data.payload().copy_dataframe() - pdf = input_df.to_pandas() + if self._execution_mode == ExecutionMode.GPU: + input_df = input_df.to_pandas() + partial_process_fn = partial(_process_df, column=self._add_column_name, value="Hello") - task = self._shared_process_pool.submit_task(self.name, partial_process_fn, pdf) + task = self._shared_process_pool.submit_task(self.name, partial_process_fn, input_df) + + df = task.result() + if self._execution_mode == ExecutionMode.GPU: + df = cudf.DataFrame.from_pandas(df) - df = cudf.DataFrame.from_pandas(task.result()) meta = MessageMeta(df) data.payload(meta) return data +@pytest.mark.gpu_and_cpu_mode def test_derived_stage_type_deduction(config: Config): mp_stage = DerivedMultiProcessingStage(c=config, process_pool_usage=0.1, add_column_name="new_column") @@ -142,13 +157,12 @@ def test_derived_stage_type_deduction(config: Config): def pandas_dataframe_generator(dataset_pandas: DatasetManager, count: int) -> Generator[pd.DataFrame, None, None]: - - df = dataset_pandas["csv_sample.csv"] for _ in range(count): - yield df + yield dataset_pandas["csv_sample.csv"] @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def test_created_stage_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -178,6 +192,7 @@ def test_created_stage_pipe(config: Config, dataset_pandas: DatasetManager): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -188,7 +203,7 @@ def test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): expected_df[add_column_name] = "Hello" pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + pipe.set_source(InMemorySourceStage(config, [input_df])) pipe.add_stage(DeserializeStage(config, ensure_sliceable_index=True)) pipe.add_stage(DerivedMultiProcessingStage(c=config, process_pool_usage=0.1, add_column_name=add_column_name)) pipe.add_stage(SerializeStage(config)) @@ -200,6 +215,7 @@ def test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def test_multiple_stages_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -214,9 +230,8 @@ def test_multiple_stages_pipe(config: Config, dataset_pandas: DatasetManager): partial_fn = partial(_process_df, column="new_column_1", value="new_value") - @stage - def pdf_to_control_message_stage(pdf: pd.DataFrame) -> ControlMessage: - df = cudf.DataFrame.from_pandas(pdf) + @stage(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) + def pdf_to_control_message_stage(df: pd.DataFrame) -> ControlMessage: meta = MessageMeta(df) msg = ControlMessage() msg.payload(meta) diff --git a/tests/morpheus/stages/test_preprocess_fil_stage.py b/tests/morpheus/stages/test_preprocess_fil_stage.py index cb9c5045be..57456e804b 100644 --- a/tests/morpheus/stages/test_preprocess_fil_stage.py +++ b/tests/morpheus/stages/test_preprocess_fil_stage.py @@ -15,21 +15,17 @@ import typing -import cupy as cp import pytest import typing_utils -import cudf - from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_fil_stage import PreprocessFILStage 
@pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.feature_length = 1 config.fil = ConfigFIL() config.fil.feature_columns = ["data"] @@ -44,18 +40,3 @@ def test_constructor(config: Config): accepted_union = typing.Union[stage.accepted_types()] assert typing_utils.issubtype(ControlMessage, accepted_union) - - -def test_process_control_message(config: Config): - stage = PreprocessFILStage(config) - input_cm = ControlMessage() - df = cudf.DataFrame({"data": [1, 2, 3]}) - meta = MessageMeta(df) - input_cm.payload(meta) - - output_cm = stage.pre_process_batch(input_cm, stage._fea_length, stage.features) - assert cp.array_equal(output_cm.tensors().get_tensor("input__0"), cp.asarray(df.to_cupy())) - expect_seq_ids = cp.zeros((df.shape[0], 3), dtype=cp.uint32) - expect_seq_ids[:, 0] = cp.arange(0, df.shape[0], dtype=cp.uint32) - expect_seq_ids[:, 2] = stage._fea_length - 1 - assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), expect_seq_ids) diff --git a/tests/morpheus/stages/test_preprocess_nlp_stage.py b/tests/morpheus/stages/test_preprocess_nlp_stage.py index 764f5e94c9..1672768cae 100644 --- a/tests/morpheus/stages/test_preprocess_nlp_stage.py +++ b/tests/morpheus/stages/test_preprocess_nlp_stage.py @@ -14,23 +14,17 @@ # limitations under the License. import typing -from unittest.mock import Mock -from unittest.mock import patch -import cupy as cp import pytest import typing_utils -import cudf - from morpheus.config import Config from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage @pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.class_labels = [ "address", "bank_acct", @@ -64,31 +58,3 @@ def test_constructor(config: Config): accepted_union = typing.Union[stage.accepted_types()] assert typing_utils.issubtype(ControlMessage, accepted_union) - - -@patch("morpheus.stages.preprocess.preprocess_nlp_stage.tokenize_text_series") -def test_process_control_message(mock_tokenize_text_series, config: Config): - mock_tokenized = Mock() - mock_tokenized.input_ids = cp.array([[1, 2], [1, 2]]) - mock_tokenized.input_mask = cp.array([[3, 4], [3, 4]]) - mock_tokenized.segment_ids = cp.array([[0, 0], [1, 1]]) - mock_tokenize_text_series.return_value = mock_tokenized - - stage = PreprocessNLPStage(config) - input_cm = ControlMessage() - df = cudf.DataFrame({"data": ["a", "b", "c"]}) - meta = MessageMeta(df) - input_cm.payload(meta) - - output_cm = stage.pre_process_batch(input_cm, - stage._vocab_hash_file, - stage._do_lower_case, - stage._seq_length, - stage._stride, - stage._truncation, - stage._add_special_tokens, - stage._column) - assert output_cm.get_metadata("inference_memory_params") == {"inference_type": "nlp"} - assert cp.array_equal(output_cm.tensors().get_tensor("input_ids"), mock_tokenized.input_ids) - assert cp.array_equal(output_cm.tensors().get_tensor("input_mask"), mock_tokenized.input_mask) - assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), mock_tokenized.segment_ids) diff --git a/tests/morpheus/stages/test_rss_source_stage_pipe.py b/tests/morpheus/stages/test_rss_source_stage_pipe.py index ab5a3f0951..84beb5d636 100644 --- a/tests/morpheus/stages/test_rss_source_stage_pipe.py +++ b/tests/morpheus/stages/test_rss_source_stage_pipe.py @@ 
-28,15 +28,13 @@ invalid_feed_input = os.path.join(TEST_DIRS.tests_data_dir, "rss_feed_atom.xm")


-@pytest.mark.use_python
 def test_support_cpp_node(config):
     url_feed_input = "https://fake.nvidia.com/rss/HomePage.xml"
-    rss_source_stage = RSSSourceStage(config, feed_input=url_feed_input)
+    rss_source_stage = RSSSourceStage(config, feed_input=[url_feed_input])

     assert rss_source_stage.supports_cpp_node() is False


-@pytest.mark.use_python
 @pytest.mark.parametrize(
     "feed_input, batch_size, expected_count, enable_cache",
     [([valid_feed_input], 30, 1, False), ([valid_feed_input], 12, 3, True),
@@ -61,9 +59,7 @@ def test_rss_source_stage_pipe(config: Config,
     assert len(sink_stage.get_messages()) == expected_count


-# TODO(Devin): Remove before merge, this isn't a stage test, this is a test of RSSController
-# @pytest.mark.use_python
-# def test_invalid_input_rss_source_stage(config: Config):
-#
-#     with pytest.raises(ValueError, match=f"Invalid URL or file path: {invalid_feed_input}"):
-#         RSSSourceStage(config, feed_input=[invalid_feed_input], interval_secs=1, cooldown_interval=500)
+def test_invalid_input_rss_source_stage(config: Config):
+
+    with pytest.raises(ValueError, match=f"Invalid URL or file path: {invalid_feed_input}"):
+        RSSSourceStage(config, feed_input=[invalid_feed_input], interval_secs=1, cooldown_interval=500)
diff --git a/tests/morpheus/stages/test_serialize_stage.py b/tests/morpheus/stages/test_serialize_stage.py
index 850950d4c0..73420668cb 100755
--- a/tests/morpheus/stages/test_serialize_stage.py
+++ b/tests/morpheus/stages/test_serialize_stage.py
@@ -16,25 +16,28 @@

 import re

+import pandas as pd
 import pytest

-import cudf
-
 from morpheus.messages import ControlMessage
 from morpheus.messages import MessageMeta
 from morpheus.stages.postprocess.serialize_stage import SerializeStage


-@pytest.mark.use_python
+@pytest.mark.cpu_mode
 def test_fixed_columns(config):
-    df1 = cudf.DataFrame()
+    """
+    The serialize stage works in both GPU and CPU modes; however, this test runs only in CPU mode since it is testing
+    the CPU implementation of the stage.
+ """ + df1 = pd.DataFrame() df1['apples'] = range(0, 4) df1['pears'] = range(5, 9) df1['apple_sauce'] = range(4, 0, -1) cm1 = ControlMessage() cm1.payload(MessageMeta(df1)) - df2 = cudf.DataFrame() + df2 = pd.DataFrame() df2['apples'] = range(4, 7) df2['applause'] = range(9, 6, -1) df2['pears'] = range(7, 10) diff --git a/tests/morpheus/stages/test_timeseries_stage.py b/tests/morpheus/stages/test_timeseries_stage.py index 51bca65c06..981eaab104 100644 --- a/tests/morpheus/stages/test_timeseries_stage.py +++ b/tests/morpheus/stages/test_timeseries_stage.py @@ -21,10 +21,10 @@ import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.timeseries_stage import TimeSeriesStage @@ -42,7 +42,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) cm.set_metadata("user_id", "test_user_id") return cm @@ -56,8 +56,7 @@ def test_constructor(config): assert typing_utils.issubtype(ControlMessage, accepted_union) -@pytest.mark.use_cudf -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_call_timeseries_user(config): stage = TimeSeriesStage(config) diff --git a/tests/morpheus/stages/test_triton_inference_stage.py b/tests/morpheus/stages/test_triton_inference_stage.py index 09dfaafb8a..abbd7ab262 100644 --- a/tests/morpheus/stages/test_triton_inference_stage.py +++ b/tests/morpheus/stages/test_triton_inference_stage.py @@ -25,6 +25,7 @@ from _utils import assert_results from _utils import mk_async_infer +from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.config import PipelineModes @@ -122,16 +123,16 @@ def test_resource_pool_create_raises_error(): assert pool.borrow_obj() == 20 -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.parametrize("pipeline_mode", list(PipelineModes)) def test_stage_constructor_worker_class(config: Config, pipeline_mode: PipelineModes): config.mode = pipeline_mode - stage = TritonInferenceStage(config, model_name='test', server_url='test:0000') + stage = TritonInferenceStage(config, model_name='test', server_url='test:0000', use_shared_memory=True) worker = stage._get_inference_worker(ProducerConsumerQueue()) assert isinstance(worker, TritonInferenceWorker) -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.parametrize("pipeline_mode", list(PipelineModes)) @pytest.mark.parametrize("needs_logits", [True, False, None]) def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes, needs_logits: bool | None): @@ -142,7 +143,11 @@ def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes config.mode = pipeline_mode - stage = TritonInferenceStage(config, model_name='test', server_url='test:0000', needs_logits=needs_logits) + stage = TritonInferenceStage(config, + model_name='test', + server_url='test:0000', + needs_logits=needs_logits, + use_shared_memory=True) worker = stage._get_inference_worker(ProducerConsumerQueue()) assert isinstance(worker, TritonInferenceWorker) @@ -150,8 +155,7 @@ def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes 
@pytest.mark.slow -@pytest.mark.use_python -# @pytest.mark.parametrize('num_records', [1000, 2000, 4000]) +@pytest.mark.gpu_mode @pytest.mark.parametrize('num_records', [10]) @mock.patch('tritonclient.grpc.InferenceServerClient') def test_triton_stage_pipe(mock_triton_client, config, num_records): @@ -196,8 +200,13 @@ def test_triton_stage_pipe(mock_triton_client, config, num_records): pipe_cm.add_stage(DeserializeStage(config)) pipe_cm.add_stage(PreprocessFILStage(config)) pipe_cm.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe_cm.add_stage(AddScoresStage(config, prefix="score_")) + # Intentionally using use_shared_memory=True as this is the only way to use the Python impl + TritonInferenceStage(config, + model_name='abp-nvsmi-xgb', + server_url='test:0000', + force_convert_inputs=True, + use_shared_memory=True)) + pipe_cm.add_stage(AddScoresStage(config, prefix="score_", probs_type=TypeId.FLOAT64)) pipe_cm.add_stage(SerializeStage(config)) comp_stage = pipe_cm.add_stage(CompareDataFrameStage(config, expected_df)) diff --git a/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py b/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py index 199ff8319d..d63dbee4ca 100644 --- a/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py +++ b/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py @@ -17,16 +17,14 @@ import typing from unittest.mock import patch -import pandas as pd import pytest import yaml -import cudf - from morpheus.config import Config from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage from morpheus.stages.output.write_to_elasticsearch_stage import WriteToElasticsearchStage +from morpheus.utils.type_aliases import DataFrameType def connection_kwargs_func(kwargs): @@ -47,7 +45,6 @@ def connection_conf_file_fixture(tmp_path): yield connection_conf_file -@pytest.mark.use_python @pytest.mark.parametrize("conf_file, exception", [("connection_conf.yaml", FileNotFoundError), (None, Exception)]) def test_constructor_invalid_conf_file(config: Config, conf_file: str, @@ -56,7 +53,6 @@ def test_constructor_invalid_conf_file(config: Config, WriteToElasticsearchStage(config, index="t_index", connection_conf_file=conf_file) -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.Elasticsearch") def test_constructor_with_custom_func(config: Config, connection_conf_file: str): expected_connection_kwargs = { @@ -73,12 +69,12 @@ def test_constructor_with_custom_func(config: Config, connection_conf_file: str) assert stage._controller._connection_kwargs == expected_connection_kwargs -@pytest.mark.use_python +@pytest.mark.use_cudf @patch("morpheus.stages.output.write_to_elasticsearch_stage.ElasticsearchController") def test_write_to_elasticsearch_stage_pipe(mock_controller: typing.Any, connection_conf_file: str, config: Config, - filter_probs_df: typing.Union[cudf.DataFrame, pd.DataFrame]): + filter_probs_df: DataFrameType): mock_df_to_parallel_bulk_write = mock_controller.return_value.df_to_parallel_bulk_write mock_refresh_client = mock_controller.return_value.refresh_client @@ -92,14 +88,11 @@ def test_write_to_elasticsearch_stage_pipe(mock_controller: typing.Any, # Run the pipeline pipe.run() - if isinstance(filter_probs_df, cudf.DataFrame): - filter_probs_df = filter_probs_df.to_pandas() - expected_index = mock_df_to_parallel_bulk_write.call_args[1]["index"] - 
expected_df = mock_df_to_parallel_bulk_write.call_args[1]["df"] + actual_df = mock_df_to_parallel_bulk_write.call_args[1]["df"] mock_refresh_client.assert_called_once() mock_df_to_parallel_bulk_write.assert_called_once() assert expected_index == "t_index" - assert expected_df.equals(filter_probs_df) + assert actual_df.equals(filter_probs_df.to_pandas()) diff --git a/tests/morpheus/stages/test_write_to_file_stage.py b/tests/morpheus/stages/test_write_to_file_stage.py deleted file mode 100755 index 002d3d4808..0000000000 --- a/tests/morpheus/stages/test_write_to_file_stage.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from unittest import mock - -import pytest - -from _utils import TEST_DIRS -from morpheus.config import Config -from morpheus.pipeline import LinearPipeline -from morpheus.stages.input.file_source_stage import FileSourceStage -from morpheus.stages.output.write_to_file_stage import WriteToFileStage - - -@pytest.mark.use_python -@pytest.mark.parametrize("flush", [False, True]) -@pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) -def test_file_rw_pipe(tmp_path: str, config: Config, output_type: str, flush: bool): - """ - Test the flush functionality of the WriteToFileStage. - """ - input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") - out_file = os.path.join(tmp_path, f'results.{output_type}') - - # This currently works because the FileSourceStage doesn't use the builtin open function, but WriteToFileStage does - mock_open = mock.mock_open() - with mock.patch('builtins.open', mock_open): - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=input_file)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False, flush=flush)) - pipe.run() - - assert not os.path.exists(out_file) - assert mock_open().flush.called == flush diff --git a/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py b/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py index 94b17a196c..56f9a7dcff 100644 --- a/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py +++ b/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
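Both the Elasticsearch test above and the Kafka test below lean on the new execution-mode-aware helpers in `morpheus.utils.type_utils` rather than importing `cudf` at module scope. A minimal standalone sketch of that resolution (illustrative, not part of the patch; `get_df_pkg` and `get_df_class` accept either an `ExecutionMode` value or the strings `"pandas"`/`"cudf"`, as exercised by the new `test_type_utils.py` later in this diff):

```python
# Resolve the DataFrame package and class for an execution mode without
# importing cudf at module scope (safe on CPU-only systems).
import pandas as pd

from morpheus.config import ExecutionMode
from morpheus.utils.type_utils import get_df_class
from morpheus.utils.type_utils import get_df_pkg

df_pkg = get_df_pkg(ExecutionMode.CPU)  # the pandas module in CPU mode, cudf in GPU mode
assert df_pkg is pd

df_class = get_df_class(ExecutionMode.CPU)  # pd.DataFrame in CPU mode
df = df_class({"apples": [1, 2, 3]})
assert len(df) == 3
```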
+import types import typing import pytest -import cudf - from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -33,9 +32,10 @@ @pytest.mark.kafka -@pytest.mark.use_cudf +@pytest.mark.gpu_and_cpu_mode def test_write_to_kafka_stage_pipe(config, - dataset_cudf: DatasetManager, + df_pkg: types.ModuleType, + dataset: DatasetManager, kafka_bootstrap_servers: str, kafka_consumer: "KafkaConsumer", kafka_topics: KafkaTopics) -> None: @@ -44,7 +44,7 @@ def test_write_to_kafka_stage_pipe(config, to ensure it works just as well with the C++ impls of the message classes. """ - filter_probs_df = dataset_cudf['filter_probs.csv'] + filter_probs_df = dataset['filter_probs.csv'] pipe = LinearPipeline(config) pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) pipe.add_stage(DeserializeStage(config)) @@ -59,9 +59,8 @@ def test_write_to_kafka_stage_pipe(config, kafka_messages = list(kafka_consumer) assert len(kafka_messages) == len(filter_probs_df) - output_df = cudf.io.read_json("\n".join(rec.value.decode("utf-8") for rec in kafka_messages), - lines=True).to_pandas() + output_df = df_pkg.read_json("\n".join(rec.value.decode("utf-8") for rec in kafka_messages), lines=True) assert len(output_df) == len(filter_probs_df) - dataset_cudf.assert_compare_df(filter_probs_df, output_df) + dataset.assert_compare_df(filter_probs_df, output_df) diff --git a/tests/morpheus/test_cli.py b/tests/morpheus/test_cli.py index 1f578e5990..0d108e0623 100755 --- a/tests/morpheus/test_cli.py +++ b/tests/morpheus/test_cli.py @@ -33,7 +33,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage @@ -141,7 +140,6 @@ def config_warning_fixture(): @pytest.mark.reload_modules(commands) @pytest.mark.usefixtures("chdir_tmpdir", "reload_modules") -@pytest.mark.use_python class TestCLI: @pytest.mark.parametrize('cmd', @@ -211,15 +209,14 @@ def test_pipeline_ae(self, config, callback_values): config = obj["config"] assert config.mode == PipelineModes.AE - assert not CppConfig.get_should_use_cpp() assert config.class_labels == ["reconstruct_loss", "zscore"] assert config.model_max_batch_size == 1024 assert config.pipeline_batch_size == 1024 assert config.num_threads == 12 assert isinstance(config.ae, ConfigAutoEncoder) - config.ae.userid_column_name = "user_col" - config.ae.userid_filter = "user321" + assert config.ae.userid_column_name == "user_col" + assert config.ae.userid_filter == "user321" expected_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt')) assert config.ae.feature_columns == expected_columns diff --git a/tests/morpheus/test_config.py b/tests/morpheus/test_config.py index 9817b5ab09..f958ce2f8b 100755 --- a/tests/morpheus/test_config.py +++ b/tests/morpheus/test_config.py @@ -17,6 +17,7 @@ import json import logging import os +from dataclasses import FrozenInstanceError from unittest import mock import pytest @@ -109,6 +110,50 @@ def test_to_string(config): assert isinstance(json.loads(conf_str), dict) +def test_frozen(config: morpheus.config.Config): + assert not config.frozen + + # Ensure that it is safe to call freeze() multiple times + for _ in range(2): + config.freeze() + assert 
config.frozen
+
+
+@pytest.mark.parametrize('use_attr', [False, True])
+def test_frozen_immutable(config: morpheus.config.Config, use_attr: bool):
+    """
+    Test for the freeze functionality.
+
+    There are currently two ways to bypass the freeze functionality:
+    1. By accessing the __dict__ attribute of the Config object.
+    2. Modifying any of the mutable objects in the Config object (ex: `config.class_labels.append('new_label')`).
+    """
+    assert not config.frozen
+
+    # ensure that we can set some attributes
+    config.feature_length = 45
+
+    # Freeze the config. Freezing via the attribute or the freeze() method has the same effect; the only difference
+    # is that freeze() is safe to call multiple times, while assigning to the attribute of an already-frozen config
+    # raises an exception, just like any other attribute assignment on a frozen object.
+    if use_attr:
+        config.frozen = True
+    else:
+        config.freeze()
+
+    assert config.frozen
+
+    with pytest.raises(FrozenInstanceError):
+        config.feature_length = 100
+
+    # ensure setattr also raises an exception
+    with pytest.raises(FrozenInstanceError):
+        setattr(config, 'feature_length', 100)
+
+    # ensure the config still has the original value
+    assert config.feature_length == 45
+
+
 def test_warning_model_batch_size_less_than_pipeline_batch_size(caplog: pytest.LogCaptureFixture):
     config = morpheus.config.Config()
     config.pipeline_batch_size = 256
diff --git a/tests/morpheus/utils/test_column_info.py b/tests/morpheus/utils/test_column_info.py
index f117ca9d9f..fe147c6218 100644
--- a/tests/morpheus/utils/test_column_info.py
+++ b/tests/morpheus/utils/test_column_info.py
@@ -24,8 +24,6 @@
 import pandas as pd
 import pytest

-import cudf
-
 from _utils import TEST_DIRS
 from morpheus.io.deserializers import read_file_to_df
 from morpheus.utils.column_info import ColumnInfo
@@ -52,14 +50,6 @@ def azure_ad_logs_pdf_fixture(_azure_ad_logs_pdf: pd.DataFrame):
     yield _azure_ad_logs_pdf.copy(deep=True)


-@pytest.fixture(name="azure_ad_logs_cdf", scope="function")
-def azure_ad_logs_cdf_fixture(_azure_ad_logs_pdf: pd.DataFrame):
-    # cudf.from_pandas essentially does a deep copy, so we can use this to ensure that the source pandas df is not
-    # modified
-    yield cudf.from_pandas(_azure_ad_logs_pdf)
-
-
-@pytest.mark.use_python
 def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFrame):
     assert len(azure_ad_logs_pdf.columns) == 16
@@ -106,7 +96,6 @@ def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFram
     process_dataframe(azure_ad_logs_pdf, schema2)


-@pytest.mark.use_python
 def test_string_cat_column():
     cities = pd.Series([
         "New York",
@@ -156,7 +145,6 @@
     assert actual.equals(expected)


-@pytest.mark.use_python
 def test_string_join_column():
     cities = pd.Series([
         "Boston",
@@ -175,7 +163,6 @@
     assert actual.equals(expected)


-@pytest.mark.use_python
 def test_column_info():
     cities = pd.Series([
         "Boston",
@@ -193,7 +180,6 @@
     assert string_join_col.name == "city"


-@pytest.mark.use_python
 def test_date_column():
     time_series = pd.Series([
         "2022-08-29T21:21:41.645157Z",
@@ -212,7 +198,6 @@
     assert datetime_series.dtype == np.dtype("datetime64[ns]")


-@pytest.mark.use_python
 def test_rename_column():
     time_series = pd.Series([
         "2022-08-29T21:21:41.645157Z",
@@ -235,7 +220,6 @@ def convert_to_upper(df, column_name: str):
     return df[column_name].str.upper()


-@pytest.mark.use_python
 def test_custom_column():
     cities = pd.Series([
         "New York",
@@
-256,7 +240,6 @@ def test_custom_column(): assert actutal.equals(expected) -@pytest.mark.use_python def test_type_cast(): """ Test reproduces issue reported in #922 diff --git a/tests/morpheus/utils/test_directory_watcher.py b/tests/morpheus/utils/test_directory_watcher.py index d7943bfb29..cc7f3dcccd 100644 --- a/tests/morpheus/utils/test_directory_watcher.py +++ b/tests/morpheus/utils/test_directory_watcher.py @@ -22,7 +22,6 @@ from morpheus.utils.directory_watcher import DirectoryWatcher -@pytest.mark.use_python @pytest.mark.parametrize('watch_directory', [True]) @pytest.mark.parametrize('max_files', [-1]) @pytest.mark.parametrize('sort_glob', [True]) diff --git a/tests/morpheus/utils/test_inference_worker.py b/tests/morpheus/utils/test_inference_worker.py index 22af7bff23..cfbefe821c 100755 --- a/tests/morpheus/utils/test_inference_worker.py +++ b/tests/morpheus/utils/test_inference_worker.py @@ -19,10 +19,10 @@ import cudf -import morpheus._lib.messages as _messages from _utils.inference_worker import IW from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.inference import inference_stage from morpheus.utils.producer_consumer_queue import ProducerConsumerQueue @@ -37,7 +37,7 @@ def test_constructor(): worker.stop() -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.usefixtures("config") def test_build_output_message(): @@ -58,7 +58,7 @@ def test_build_output_message(): input__0 = cp.array([[0.], [2.], [4.], [6.], [8.], [10.], [12.], [14.], [16.], [18.]]) seq_ids = cp.array([[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0], [8, 0, 0], [9, 0, 0]]) - msg.tensors(_messages.TensorMemory(count=num_records, tensors={'input__0': input__0, 'seq_ids': seq_ids})) + msg.tensors(TensorMemory(count=num_records, tensors={'input__0': input__0, 'seq_ids': seq_ids})) output_message = worker.build_output_message(msg) diff --git a/tests/morpheus/utils/test_module_utils.py b/tests/morpheus/utils/test_module_utils.py index adcdc3e660..baf8027a9e 100644 --- a/tests/morpheus/utils/test_module_utils.py +++ b/tests/morpheus/utils/test_module_utils.py @@ -28,7 +28,6 @@ # pylint: disable=unused-argument,too-many-function-args -@pytest.mark.use_python def test_mrc_version(): assert len(mrc_version) == 3 assert isinstance(mrc_version, list) diff --git a/tests/morpheus/utils/test_type_utils.py b/tests/morpheus/utils/test_type_utils.py new file mode 100644 index 0000000000..ab06f39fcb --- /dev/null +++ b/tests/morpheus/utils/test_type_utils.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
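The body of the new `tests/morpheus/utils/test_type_utils.py` module continues below. As a quick standalone illustration of the `is_cudf_type` behavior it verifies (a sketch that assumes a GPU environment where `cudf` is importable; the helper exists precisely so production code can avoid that import):

```python
# is_cudf_type reports whether an object is a cuDF container without the
# caller needing a top-level cudf import.
import pandas as pd

import cudf

from morpheus.utils.type_utils import is_cudf_type

# True for cuDF containers...
assert is_cudf_type(cudf.DataFrame())
assert is_cudf_type(cudf.Series())

# ...False for pandas equivalents and for arbitrary non-DataFrame objects.
assert not is_cudf_type(pd.DataFrame())
assert not is_cudf_type("test")
```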
+
+import types
+import typing
+
+import pandas as pd
+import pytest
+
+import cudf
+
+from morpheus.config import ExecutionMode
+from morpheus.utils.type_aliases import DataFrameModule
+from morpheus.utils.type_utils import df_type_str_to_exec_mode
+from morpheus.utils.type_utils import df_type_str_to_pkg
+from morpheus.utils.type_utils import exec_mode_to_df_type_str
+from morpheus.utils.type_utils import get_df_class
+from morpheus.utils.type_utils import get_df_pkg
+from morpheus.utils.type_utils import is_cudf_type
+
+
+@pytest.mark.parametrize("mode, expected",
+                         [(ExecutionMode.GPU, cudf.DataFrame), (ExecutionMode.CPU, pd.DataFrame),
+                          ("cudf", cudf.DataFrame), ("pandas", pd.DataFrame)])
+def test_get_df_class(mode: typing.Union[ExecutionMode, DataFrameModule], expected: type):
+    assert get_df_class(mode) is expected
+
+
+@pytest.mark.parametrize("mode, expected", [(ExecutionMode.GPU, cudf), (ExecutionMode.CPU, pd), ("cudf", cudf),
+                                            ("pandas", pd)])
+def test_get_df_pkg(mode: typing.Union[ExecutionMode, DataFrameModule], expected: types.ModuleType):
+    assert get_df_pkg(mode) is expected
+
+
+@pytest.mark.parametrize(
+    "obj, expected",
+    [
+        (cudf.DataFrame(), True),
+        (cudf.Series(), True),
+        (cudf.Index([]), True),
+        (cudf.RangeIndex(0), True),
+        (pd.DataFrame(), False),
+        (pd.Series(), False),
+        (pd.Index([]), False),
+        (pd.RangeIndex(0), False),
+        (None, False),
+        (0, False),
+        ("test", False),
+    ],
+    ids=[
+        "cudf.DataFrame",
+        "cudf.Series",
+        "cudf.Index",
+        "cudf.RangeIndex",
+        "pd.DataFrame",
+        "pd.Series",
+        "pd.Index",
+        "pd.RangeIndex",
+        "None",
+        "int",
+        "str"
+    ],
+)
+def test_is_cudf_type(obj: typing.Any, expected: bool):
+    assert is_cudf_type(obj) == expected
+
+
+@pytest.mark.parametrize("df_type_str, expected", [("cudf", cudf), ("pandas", pd)], ids=["cudf", "pandas"])
+def test_df_type_str_to_pkg(df_type_str: DataFrameModule, expected: types.ModuleType):
+    assert df_type_str_to_pkg(df_type_str) is expected
+
+
+@pytest.mark.parametrize("invalid_type_str", ["invalid", "cuDF", "Pandas"])
+def test_df_type_str_to_pkg_invalid(invalid_type_str: typing.Any):
+    with pytest.raises(ValueError, match="Invalid DataFrame type string"):
+        df_type_str_to_pkg(invalid_type_str)
+
+
+@pytest.mark.parametrize("df_type_str, expected", [("cudf", ExecutionMode.GPU), ("pandas", ExecutionMode.CPU)],
+                         ids=["cudf", "pandas"])
+def test_df_type_str_to_exec_mode(df_type_str: DataFrameModule, expected: ExecutionMode):
+    assert df_type_str_to_exec_mode(df_type_str) == expected
+
+
+@pytest.mark.parametrize("invalid_type_str", ["invalid", "cuDF", "Pandas"])
+def test_df_type_str_to_exec_mode_invalid(invalid_type_str: typing.Any):
+    with pytest.raises(ValueError, match="Invalid DataFrame type string"):
+        df_type_str_to_exec_mode(invalid_type_str)
+
+
+@pytest.mark.parametrize("exec_mode, expected", [(ExecutionMode.GPU, "cudf"), (ExecutionMode.CPU, "pandas")],
+                         ids=["GPU", "CPU"])
+def test_exec_mode_to_df_type_str(exec_mode: ExecutionMode, expected: DataFrameModule):
+    assert exec_mode_to_df_type_str(exec_mode) == expected
diff --git a/tests/morpheus_dfp/conftest.py b/tests/morpheus_dfp/conftest.py
index 4609e4ceee..3bdc85bd39 100644
--- a/tests/morpheus_dfp/conftest.py
+++ b/tests/morpheus_dfp/conftest.py
@@ -60,12 +60,11 @@ def ae_feature_cols_fixture():


 @pytest.fixture(name="config")
-def config_fixture(config_no_cpp: Config, ae_feature_cols: typing.List[str]):
+def config_fixture(config: Config, ae_feature_cols: typing.List[str]):
     """
     The digital_fingerprinting production
example utilizes the Auto Encoder config.
     """
     from morpheus.config import ConfigAutoEncoder
-    config = config_no_cpp
     config.ae = ConfigAutoEncoder()
     config.ae.feature_columns = ae_feature_cols
     yield config
@@ -87,25 +86,22 @@ def dfp_prod_in_sys_path(
     sys.path.append(example_dir)


-@pytest.fixture(name="dfp_message_meta")
-def dfp_message_meta_fixture(config, dataset_pandas: DatasetManager):
-    import pandas as pd
+@pytest.fixture
+def control_message(config, dataset_cudf: DatasetManager):
+    import cudf

-    from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta
+    from morpheus.messages import ControlMessage
+    from morpheus.messages import MessageMeta

     user_id = 'test_user'
-    df = dataset_pandas['filter_probs.csv']
-    df[config.ae.timestamp_column_name] = pd.to_datetime([1683054498 + i for i in range(0, len(df) * 30, 30)], unit='s')
+    df = dataset_cudf['filter_probs.csv']
+    timestamps = [1683054498 + i for i in range(0, len(df) * 30, 30)]
+    df[config.ae.timestamp_column_name] = cudf.to_datetime(timestamps, unit='s')
     df[config.ae.userid_column_name] = user_id

-    yield DFPMessageMeta(df, user_id)
-
-@pytest.fixture
-def control_message(dfp_message_meta):
-    from morpheus.messages import ControlMessage
     message = ControlMessage()
-    message.payload(dfp_message_meta)
-    message.set_metadata("user_id", dfp_message_meta.user_id)
+    message.payload(MessageMeta(df))
+    message.set_metadata("user_id", user_id)
     message.set_metadata("model", mock.MagicMock())
     yield message
diff --git a/tests/morpheus_dfp/modules/test_dfp_training.py b/tests/morpheus_dfp/modules/test_dfp_training.py
index e4683c1ea2..4408e3bd15 100644
--- a/tests/morpheus_dfp/modules/test_dfp_training.py
+++ b/tests/morpheus_dfp/modules/test_dfp_training.py
@@ -21,6 +21,8 @@
 from _utils import TEST_DIRS
 from _utils.dataset_manager import DatasetManager
 from morpheus.config import Config
+from morpheus.messages import ControlMessage
+from morpheus.messages import MessageMeta
 from morpheus.pipeline.single_port_stage import SinglePortStage


@@ -50,8 +52,6 @@ def test_on_data(mock_train_test_split: mock.MagicMock,
                 config: Config,
                 dataset_pandas: DatasetManager,
                 validation_size: float):
-    from morpheus.messages import ControlMessage
-    from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta
     from morpheus_dfp.stages.dfp_training import DFPTraining

     mock_ae.return_value = mock_ae
@@ -63,10 +63,9 @@ def test_on_data(mock_train_test_split: mock.MagicMock,
     mock_validation_df = mock.MagicMock()
     mock_train_test_split.return_value = (train_df, mock_validation_df)

-    meta = DFPMessageMeta(df, 'Account-123456789')
     msg = ControlMessage()
-    msg.payload(meta)
-    msg.set_metadata("user_id", meta.user_id)
+    msg.payload(MessageMeta(df))
+    msg.set_metadata("user_id", 'Account-123456789')

     stage = DFPTraining(config, validation_size=validation_size)
     results = stage.on_data(msg)
diff --git a/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py b/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py
index b39e05a03d..39dcfd7d6b 100644
--- a/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py
+++ b/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py
@@ -27,6 +27,7 @@
 from _utils.dataset_manager import DatasetManager
 from morpheus.config import Config
 from morpheus.messages import ControlMessage
+from morpheus.messages import MessageMeta
 from morpheus.pipeline.single_port_stage import SinglePortStage

 MockedRequests = namedtuple("MockedRequests", ["get", "patch", "response"])
@@ -238,7 +239,6 @@ def
test_on_data( databricks_env: dict, databricks_permissions: dict, tracking_uri: str): - from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage from morpheus_dfp.stages.dfp_mlflow_model_writer import conda_env @@ -272,11 +272,10 @@ def test_on_data( mock_model.prepare_df.return_value = df mock_model.get_anomaly_score.return_value = pd.Series(float(i) for i in range(len(df))) - meta = DFPMessageMeta(df, 'Account-123456789') msg = ControlMessage() - msg.payload(meta) + msg.payload(MessageMeta(df)) msg.set_metadata("model", mock_model) - msg.set_metadata("user_id", meta.user_id) + msg.set_metadata("user_id", 'Account-123456789') stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions, timeout=10) assert stage._controller.on_data(msg) is msg # Should be a pass-thru diff --git a/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py b/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py index b8f7e8cd18..06d142f91c 100644 --- a/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py +++ b/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py @@ -21,9 +21,15 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.pipeline.single_port_stage import SinglePortStage +@pytest.fixture(name="train_df") +def train_df_fixture(control_message: ControlMessage) -> pd.DataFrame: + return control_message.payload().copy_dataframe().to_pandas() + + def build_mock_user_cache(user_id: str = 'test_user', train_df: pd.DataFrame = None, count: int = 10, @@ -81,109 +87,91 @@ def test_get_user_cache_miss(config: Config): assert results2 is results -def test_build_window_no_new( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_no_new(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') mock_cache = build_mock_user_cache() mock_cache.append_dataframe.return_value = False - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_not_enough_data( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_not_enough_data(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') mock_cache = build_mock_user_cache(count=3) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_min_increment( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_min_increment(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, 
cache_dir='/test/path/cache') mock_cache = build_mock_user_cache(count=5, total_count=30, last_train_count=25) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_invalid( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_invalid(config: Config, control_message: ControlMessage, train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') - train_df = dfp_message_meta.copy_dataframe() # exact values not important so long as they don't match the actual hash train_df['_row_hash'] = [-1 for _ in range(len(train_df))] mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache with pytest.raises(RuntimeError): - stage._build_window(dfp_message_meta) + stage._build_window(control_message) -def test_build_window_overlap( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_overlap(config: Config, control_message: ControlMessage, train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') # Create an overlap - train_df = dfp_message_meta.copy_dataframe()[-5:] + train_df = train_df[-5:] train_df['_row_hash'] = pd.util.hash_pandas_object(train_df, index=False) mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache with pytest.raises(RuntimeError): - stage._build_window(dfp_message_meta) + stage._build_window(control_message) @pytest.mark.parametrize('use_on_data', [True, False]) -def test_build_window( - config: Config, - use_on_data: bool, - dfp_message_meta: "DFPMessageMeta", # noqa: F821 - dataset_pandas: DatasetManager): - from morpheus.messages import ControlMessage +def test_build_window(config: Config, + use_on_data: bool, + control_message: ControlMessage, + dataset_pandas: DatasetManager, + train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') # Create an overlap - train_df = dfp_message_meta.copy_dataframe() train_df['_row_hash'] = pd.util.hash_pandas_object(train_df, index=False) mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache # on_data is a thin wrapper around _build_window, results should be the same if use_on_data: - msg = stage.on_data(dfp_message_meta) + out_msg = stage.on_data(control_message) else: - msg = stage._build_window(dfp_message_meta) + out_msg = stage._build_window(control_message) - assert isinstance(msg, ControlMessage) - assert msg.get_metadata("user_id") == dfp_message_meta.user_id - assert msg.payload().count == len(dataset_pandas['filter_probs.csv']) - 
dataset_pandas.assert_df_equal(msg.payload().df, train_df) + assert isinstance(out_msg, ControlMessage) + assert out_msg.get_metadata("user_id") == control_message.get_metadata('user_id') + assert out_msg.payload().count == len(dataset_pandas['filter_probs.csv']) + dataset_pandas.assert_df_equal(out_msg.payload().df, train_df) diff --git a/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py b/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py index 8563fd7f9a..0dca35fd02 100644 --- a/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py +++ b/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py @@ -17,6 +17,7 @@ import os import typing import warnings +from collections import defaultdict import pytest @@ -24,6 +25,7 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.utils.type_utils import get_df_pkg_from_obj def test_constructor(config: Config): @@ -67,14 +69,24 @@ def test_extract_users(config: Config, from morpheus_dfp.stages.dfp_split_users_stage import DFPSplitUsersStage config.ae.userid_column_name = "From" config.ae.fallback_username = "testy_testerson" + ts_col = config.ae.timestamp_column_name input_file = os.path.join(TEST_DIRS.tests_data_dir, "examples/developer_guide/email_with_addresses_first_10.jsonlines") df = dataset[input_file] + df_pkg = get_df_pkg_from_obj(df) + + # When the file is read using pandas (as is the case in the actual DFP pipeline), the timestamp column is + # automatically converted to datetime objects. However cuDF doesn't do this and the column will contain integers. + # When `dataset` is returning pandas DFs this might still be the case if `input_file` is first read using cuDF and + # cached by the DatasetManager and then converted to pandas. 
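The hunk resumes below with the conversion itself; the same normalization pattern can be sketched standalone (illustrative only, with hypothetical data using the same epoch-second convention as the fixtures above):

```python
# Normalize an epoch-second integer column to datetimes using whichever
# DataFrame package (pandas or cudf) matches the input frame.
import pandas as pd

from morpheus.utils.type_utils import get_df_pkg_from_obj

df = pd.DataFrame({"timestamp": [1683054498, 1683054528]})
df_pkg = get_df_pkg_from_obj(df)  # the pandas module for a pandas df, cudf for a cuDF df

if df["timestamp"].dtype == "int64":
    df["timestamp"] = df_pkg.to_datetime(df["timestamp"], unit="s")

assert str(df["timestamp"].dtype) == "datetime64[ns]"
```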
+ if df[ts_col].dtype == 'int64': + df[ts_col] = df_pkg.to_datetime(df[ts_col], unit='s') all_data = [] - expected_data = {} + expected_data = defaultdict(list) + with open(input_file, encoding='UTF-8') as fh: for line in fh: json_data = json.loads(line) @@ -85,11 +97,13 @@ def test_extract_users(config: Config, if len(only_users) > 0 and user_id not in only_users: continue + json_data[ts_col] = df_pkg.to_datetime(json_data[ts_col], unit='s') + if include_generic: all_data.append(json_data) if include_individual: - expected_data[user_id] = [json_data] + expected_data[user_id].append(json_data) if include_generic: expected_data[config.ae.fallback_username] = all_data @@ -114,9 +128,11 @@ def test_extract_users(config: Config, # Add one for the generic user assert len(results) == len(expected_data) for msg in results: - assert len(msg.df) == len(expected_data[msg.user_id]) - if msg.user_id != config.ae.fallback_username: - assert msg.df.iloc[0].to_dict() == expected_data[msg.user_id][0] + actual_df = msg.payload().df + user_id = msg.get_metadata('user_id') + assert len(actual_df) == len(expected_data[user_id]) + if user_id != config.ae.fallback_username: + assert actual_df.to_dict('records') == expected_data[user_id] def test_extract_users_none_to_empty(config: Config): diff --git a/tests/morpheus_dfp/test_dfp.py b/tests/morpheus_dfp/test_dfp.py index 521509369d..fa51270930 100755 --- a/tests/morpheus_dfp/test_dfp.py +++ b/tests/morpheus_dfp/test_dfp.py @@ -23,12 +23,12 @@ from _utils import TEST_DIRS from _utils import calc_error_val +from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.config import PipelineModes from morpheus.messages import ControlMessage from morpheus.messages.message_meta import MessageMeta -from morpheus.messages.message_meta import UserMessageMeta from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage @@ -45,7 +45,7 @@ @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') @@ -97,7 +97,7 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morph sort_glob=True)) pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) + pipe.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) pipe.add_stage( TimeSeriesStage(config, resolution="1m", @@ -131,7 +131,7 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morph @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') @@ -181,7 +181,7 @@ def test_dfp_user123(mock_ae: mock.MagicMock, config: Config, tmp_path: str, mor sort_glob=True)) pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) + pipe.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) pipe.add_stage( TimeSeriesStage(config, resolution="1m", @@ -214,7 +214,7 @@ def test_dfp_user123(mock_ae: mock.MagicMock, config: 
Config, tmp_path: str, mor @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') @@ -255,7 +255,7 @@ def test_dfp_user123_multi_segment(mock_ae: mock.MagicMock, config: Config, tmp_ pipe = LinearPipeline(config) pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_segment_boundary(UserMessageMeta) # Boundary 1 + pipe.add_segment_boundary(ControlMessage) # Boundary 1 pipe.add_stage( train_ae_stage.TrainAEStage( config, @@ -268,7 +268,7 @@ def test_dfp_user123_multi_segment(mock_ae: mock.MagicMock, config: Config, tmp_ pipe.add_segment_boundary(ControlMessage) # Boundary 3 pipe.add_stage(AutoEncoderInferenceStage(config)) pipe.add_segment_boundary(ControlMessage) # Boundary 4 - pipe.add_stage(AddScoresStage(config)) + pipe.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) pipe.add_segment_boundary(ControlMessage) # Boundary 5 pipe.add_stage( TimeSeriesStage(config, diff --git a/tests/morpheus_dfp/test_dfp_kafka.py b/tests/morpheus_dfp/test_dfp_kafka.py index d952b00ae7..98ef108350 100755 --- a/tests/morpheus_dfp/test_dfp_kafka.py +++ b/tests/morpheus_dfp/test_dfp_kafka.py @@ -27,6 +27,7 @@ from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from morpheus.cli import commands +from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.config import PipelineModes @@ -50,7 +51,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.reload_modules([commands, preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules", "loglevel_debug") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') @@ -103,7 +104,7 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, sort_glob=True)) pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) + pipe.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) pipe.add_stage( TimeSeriesStage(config, resolution="1m", @@ -154,7 +155,7 @@ def test_dfp_roleg(mock_ae: mock.MagicMock, @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) @pytest.mark.usefixtures("reload_modules", "loglevel_debug") @mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') @@ -206,7 +207,7 @@ def test_dfp_user123(mock_ae: mock.MagicMock, sort_glob=True)) pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) + pipe.add_stage(AddScoresStage(config, probs_type=TypeId.FLOAT64)) pipe.add_stage( TimeSeriesStage(config, resolution="1m", diff --git a/tests/morpheus_llm/llm/test_vdb_upload_pipe.py b/tests/morpheus_llm/llm/test_vdb_upload_pipe.py index 88a51c631f..85394da57c 100755 --- a/tests/morpheus_llm/llm/test_vdb_upload_pipe.py +++ b/tests/morpheus_llm/llm/test_vdb_upload_pipe.py @@ -31,7 +31,6 @@ @pytest.mark.milvus -@pytest.mark.use_python @pytest.mark.use_pandas @pytest.mark.import_mod([ os.path.join(TEST_DIRS.examples_dir, 'llm/common'), diff --git a/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py b/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py 
index 4d3935091b..7cb381f15c 100644 --- a/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py +++ b/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py @@ -16,8 +16,6 @@ import os -import pytest - from _utils import TEST_DIRS from _utils import assert_results from _utils.dataset_manager import DatasetManager @@ -39,8 +37,6 @@ def _build_engine() -> LLMEngine: return engine -@pytest.mark.use_cudf -@pytest.mark.use_python def test_pipeline(config: Config, dataset_cudf: DatasetManager): test_data = os.path.join(TEST_DIRS.validation_data_dir, 'root-cause-validation-data-input.jsonlines') input_df = dataset_cudf[test_data] diff --git a/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py b/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py index 20c8bf243a..b39aa1920d 100755 --- a/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py +++ b/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py @@ -45,7 +45,7 @@ def get_test_df(num_input_rows): @pytest.mark.milvus -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("use_instance, num_input_rows, expected_num_output_rows, resource_kwargs, recreate", [(True, 5, 5, { "partition_name": "age_partition" diff --git a/tests/test_conftest.py b/tests/test_conftest.py index cc37b918d3..5856152771 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -21,18 +21,40 @@ import cudf from _utils.dataset_manager import DatasetManager +from morpheus.config import Config from morpheus.config import CppConfig +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_utils import exec_mode_to_df_type_str -@pytest.fixture(name="cpp_from_marker", scope="function") -def cpp_from_marker_fixture(request: pytest.FixtureRequest) -> bool: +def exec_mode_to_cpp_mode(exec_mode: ExecutionMode) -> bool: + return exec_mode == ExecutionMode.GPU - use_cpp = len([x for x in request.node.iter_markers("use_cpp") if "added_by" in x.kwargs]) > 0 - use_python = len([x for x in request.node.iter_markers("use_python") if "added_by" in x.kwargs]) > 0 - assert use_cpp != use_python +@pytest.fixture(name="exec_mode_from_marker", scope="function") +def exec_mode_from_marker_fixture(request: pytest.FixtureRequest) -> ExecutionMode: - return use_cpp + gpu_mode = len([x for x in request.node.iter_markers("gpu_mode") if "added_by" in x.kwargs]) > 0 + cpu_mode = len([x for x in request.node.iter_markers("cpu_mode") if "added_by" in x.kwargs]) > 0 + + assert gpu_mode != cpu_mode + + if gpu_mode: + return ExecutionMode.GPU + + return ExecutionMode.CPU + + +@pytest.fixture(name="cpp_mode_from_marker", scope="function") +def cpp_mode_from_marker_fixture(request: pytest.FixtureRequest) -> bool: + + gpu_mode = len([x for x in request.node.iter_markers("gpu_mode") if "added_by" in x.kwargs]) > 0 + cpu_mode = len([x for x in request.node.iter_markers("cpu_mode") if "added_by" in x.kwargs]) > 0 + + assert gpu_mode != cpu_mode + + return gpu_mode @pytest.fixture(name="df_type_from_marker", scope="function") @@ -117,78 +139,60 @@ def test_no_mark(): # === No Marks === -@pytest.mark.use_cpp -def test_mark_use_cpp(): +@pytest.mark.gpu_mode +def test_mark_gpu_mode(): assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -def test_mark_use_python(): +@pytest.mark.cpu_mode +def test_mark_cpu_mode(): assert not CppConfig.get_should_use_cpp() -@pytest.mark.use_cpp -@pytest.mark.use_python -def test_mark_both(cpp_from_marker: bool): 
- assert CppConfig.get_should_use_cpp() == cpp_from_marker - - # === Marks and Config === -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("config") -def test_mark_and_config_use_cpp(): +def test_mark_and_config_gpu_mode(): assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -@pytest.mark.usefixtures("config") -def test_mark_and_config_use_python(): +@pytest.mark.cpu_mode +def test_mark_and_config_cpu_mode(config: Config): assert not CppConfig.get_should_use_cpp() + assert config.execution_mode == ExecutionMode.CPU -@pytest.mark.use_cpp -@pytest.mark.use_python -@pytest.mark.usefixtures("config") -def test_mark_and_config_both(cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker +@pytest.mark.gpu_and_cpu_mode +def test_gpu_and_cpu_mode(config: Config, exec_mode_from_marker: ExecutionMode): + assert config.execution_mode == exec_mode_from_marker -@pytest.mark.usefixtures("config") -def test_mark_and_config_neither(cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker +def test_mark_and_config_neither(config: Config): + assert CppConfig.get_should_use_cpp() + assert config.execution_mode == ExecutionMode.GPU # === Fixture === -@pytest.mark.use_cpp -def test_fixture_use_cpp(use_cpp: bool): - assert use_cpp +@pytest.mark.gpu_mode +def test_fixture_gpu_mode(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -def test_fixture_use_python(use_cpp: bool): - assert not use_cpp +@pytest.mark.cpu_mode +def test_fixture_cpu_mode(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() -@pytest.mark.use_cpp -@pytest.mark.use_python -def test_fixture_both(use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp - - -def test_fixture_neither(use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp +def test_fixture_neither(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU + assert CppConfig.get_should_use_cpp() # === Config Fixture === -@pytest.mark.usefixtures("config_no_cpp") -def test_config_fixture_no_cpp(): - assert not CppConfig.get_should_use_cpp() - - -@pytest.mark.usefixtures("config_only_cpp") -def test_config_fixture_only_cpp(): +@pytest.mark.usefixtures("config") +def test_config_fixture(): assert CppConfig.get_should_use_cpp() @@ -197,67 +201,62 @@ class TestNoMarkerClass: def test_no_marker(self): assert CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_python_marker(self): + @pytest.mark.cpu_mode + def test_python_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - def test_cpp_marker(self): + @pytest.mark.gpu_mode + def test_cpp_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - @pytest.mark.use_python - def test_marker_both(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - @pytest.mark.slow - def test_other_marker(self, use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp + def test_other_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU + assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python +@pytest.mark.cpu_mode class TestPythonMarkerClass: def test_no_marker(self): assert not 
CppConfig.get_should_use_cpp() - def test_with_fixture(self, use_cpp: bool): - assert not use_cpp + def test_with_fixture(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_extra_marker(self): + @pytest.mark.cpu_mode + def test_extra_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - def test_add_marker(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - -@pytest.mark.use_cpp +@pytest.mark.gpu_mode class TestCppMarkerClass: def test_no_marker(self): assert CppConfig.get_should_use_cpp() - def test_with_fixture(self, use_cpp: bool): - assert use_cpp + def test_with_fixture(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp + @pytest.mark.gpu_mode def test_extra_marker(self): assert CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_add_marker(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - # === DF Type === def test_df_type_no_marks(df_type, df_type_from_marker): assert df_type == df_type_from_marker +def test_df_type_matches_execution_mode(df_type: DataFrameModule, execution_mode: ExecutionMode): + assert df_type == exec_mode_to_df_type_str(execution_mode) + + @pytest.mark.use_pandas def test_df_type_pandas_marker(df_type): assert df_type == "pandas"
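Taken together, the tests above pin down the marker semantics that replace the old `use_cpp`/`use_python` pair. A downstream test that supports both execution modes would opt in with the new `gpu_and_cpu_mode` marker; a sketch (the test name is hypothetical, and it assumes the `execution_mode` fixture reflects the mode selected by the marker, as exercised above):

```python
import pytest

from morpheus.config import Config
from morpheus.config import ExecutionMode
from morpheus.utils.type_utils import get_df_pkg


@pytest.mark.gpu_and_cpu_mode
def test_runs_in_both_modes(config: Config, execution_mode: ExecutionMode):
    # The marker parameterizes the test over both modes; the config fixture
    # reflects whichever mode the current parameterization selected.
    assert config.execution_mode == execution_mode

    # Pick the matching DataFrame package (cudf in GPU mode, pandas in CPU mode)
    df_pkg = get_df_pkg(execution_mode)
    assert df_pkg.DataFrame({"a": [1, 2]})["a"].sum() == 3
```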