From 2b8f559a8c6e15df8c68eba30d221c8d7dba040e Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 20 Sep 2024 13:59:47 -0400 Subject: [PATCH 1/7] remove starter dfp code, tests, doc refs --- ci/release/update-version.sh | 1 - docs/source/basics/overview.rst | 3 - docs/source/cloud_deployment_guide.md | 41 --- docs/source/developer_guide/contributing.md | 2 +- .../guides/5_digital_fingerprinting.md | 55 +-- docs/source/getting_started.md | 30 -- docs/source/stages/morpheus_stages.md | 6 - examples/digital_fingerprinting/README.md | 47 --- .../production/README.md | 4 +- .../digital_fingerprinting/starter/README.md | 304 --------------- .../starter/run_cloudtrail_dfp.py | 157 -------- morpheus.code-workspace | 153 -------- python/morpheus/morpheus/cli/commands.py | 138 +------ .../inference/auto_encoder_inference_stage.py | 150 -------- .../stages/input/autoencoder_source_stage.py | 329 ----------------- .../stages/input/azure_source_stage.py | 180 --------- .../stages/input/cloud_trail_source_stage.py | 187 ---------- .../morpheus/stages/input/duo_source_stage.py | 175 --------- .../stages/preprocess/preprocess_ae_stage.py | 113 ------ .../stages/preprocess/train_ae_stage.py | 345 ------------------ scripts/validation/hammah/val-hammah-all.sh | 27 -- scripts/validation/hammah/val-hammah.sh | 132 ------- scripts/validation/val-run-pipeline.sh | 44 --- tests/benchmarks/README.md | 2 +- tests/benchmarks/e2e_test_configs.json | 12 +- tests/benchmarks/test_bench_e2e_pipelines.py | 55 --- tests/stages/test_preprocess_ae_stage.py | 64 ---- tests/test_cli.py | 254 +------------ tests/test_dfp.py | 304 --------------- tests/test_dfp_kafka.py | 255 ------------- tests/tests_data/dfp_roleg_anomaly_score.csv | 3 - tests/tests_data/dfp_roleg_exp_results.csv | 3 - tests/tests_data/dfp_roleg_tensor.csv | 3 - .../tests_data/dfp_user123_anomaly_score.csv | 3 - tests/tests_data/dfp_user123_exp_results.csv | 3 - tests/tests_data/dfp_user123_tensor.csv | 3 - 36 files changed, 28 
insertions(+), 3559 deletions(-) delete mode 100644 examples/digital_fingerprinting/README.md delete mode 100644 examples/digital_fingerprinting/starter/README.md delete mode 100644 examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py delete mode 100644 python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py delete mode 100644 python/morpheus/morpheus/stages/input/autoencoder_source_stage.py delete mode 100644 python/morpheus/morpheus/stages/input/azure_source_stage.py delete mode 100644 python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py delete mode 100644 python/morpheus/morpheus/stages/input/duo_source_stage.py delete mode 100644 python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py delete mode 100644 python/morpheus/morpheus/stages/preprocess/train_ae_stage.py delete mode 100755 scripts/validation/hammah/val-hammah-all.sh delete mode 100755 scripts/validation/hammah/val-hammah.sh delete mode 100644 tests/stages/test_preprocess_ae_stage.py delete mode 100755 tests/test_dfp.py delete mode 100755 tests/test_dfp_kafka.py delete mode 100644 tests/tests_data/dfp_roleg_anomaly_score.csv delete mode 100644 tests/tests_data/dfp_roleg_exp_results.csv delete mode 100644 tests/tests_data/dfp_roleg_tensor.csv delete mode 100644 tests/tests_data/dfp_user123_anomaly_score.csv delete mode 100644 tests/tests_data/dfp_user123_exp_results.csv delete mode 100644 tests/tests_data/dfp_user123_tensor.csv diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 6651dcdb15..cc7b26d073 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -91,7 +91,6 @@ sed_runner "s/v${CURRENT_FULL_VERSION}-runtime/v${NEXT_FULL_VERSION}-runtime/g" examples/digital_fingerprinting/production/docker-compose.yml \ examples/digital_fingerprinting/production/Dockerfile sed_runner "s/v${CURRENT_FULL_VERSION}-runtime/v${NEXT_FULL_VERSION}-runtime/g" examples/digital_fingerprinting/production/Dockerfile 
-sed_runner "s|blob/branch-${CURRENT_SHORT_TAG}|blob/branch-${NEXT_SHORT_TAG}|g" examples/digital_fingerprinting/starter/README.md # examples/developer_guide sed_runner 's/'"VERSION ${CURRENT_FULL_VERSION}.*"'/'"VERSION ${NEXT_FULL_VERSION}"'/g' \ diff --git a/docs/source/basics/overview.rst b/docs/source/basics/overview.rst index ca1f8b6981..c6f6b2a348 100644 --- a/docs/source/basics/overview.rst +++ b/docs/source/basics/overview.rst @@ -52,15 +52,12 @@ run: --help Show this message and exit. Commands: - pipeline-ae Run the inference pipeline with an AutoEncoder model pipeline-fil Run the inference pipeline with a FIL model pipeline-nlp Run the inference pipeline with a NLP model pipeline-other Run a custom inference pipeline without a specific model type Currently, Morpheus pipeline can be operated in four different modes. - * ``pipeline-ae`` - * This pipeline mode is used to run training/inference on the AutoEncoder model. * ``pipeline-fil`` * This pipeline mode is used to run inference on FIL (Forest Inference Library) models such as XGBoost, RandomForestClassifier, etc. * ``pipeline-nlp`` diff --git a/docs/source/cloud_deployment_guide.md b/docs/source/cloud_deployment_guide.md index 1dac95c9ae..31641c365a 100644 --- a/docs/source/cloud_deployment_guide.md +++ b/docs/source/cloud_deployment_guide.md @@ -32,7 +32,6 @@ limitations under the License. 
- [Verify Model Deployment](#verify-model-deployment) - [Create Kafka Topics](#create-kafka-topics) - [Example Workflows](#example-workflows) - - [Run AutoEncoder Digital Fingerprinting Pipeline](#run-autoencoder-digital-fingerprinting-pipeline) - [Run NLP Phishing Detection Pipeline](#run-nlp-phishing-detection-pipeline) - [Run NLP Sensitive Information Detection Pipeline](#run-nlp-sensitive-information-detection-pipeline) - [Run FIL Anomalous Behavior Profiling Pipeline](#run-fil-anomalous-behavior-profiling-pipeline) @@ -424,46 +423,6 @@ helm install --set ngc.apiKey="$API_KEY" \ morpheus-sdk-client ``` - -### Run AutoEncoder Digital Fingerprinting Pipeline -The following AutoEncoder pipeline example shows how to train and validate the AutoEncoder model and write the inference results to a specified location. Digital fingerprinting has also been referred to as **HAMMAH (Human as Machine <> Machine as Human)**. -These use cases are currently implemented to detect user behavior changes that indicate a change from a human to a machine or a machine to a human, thus leaving a "digital fingerprint." The model is an ensemble of an autoencoder and fast Fourier transform reconstruction. - -Inference and training based on a user ID (`user123`). The model is trained once and inference is conducted on the supplied input entries in the example pipeline below. The `--train_data_glob` parameter must be removed for continuous training. 
- -```bash -helm install --set ngc.apiKey="$API_KEY" \ - --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=2 \ - --edge_buffer_size=4 \ - --pipeline_batch_size=1024 \ - --model_max_batch_size=1024 \ - --use_cpp=False \ - pipeline-ae \ - --columns_file=data/columns_ae_cloudtrail.txt \ - --userid_filter=user123 \ - --feature_scaler=standard \ - --userid_column_name=userIdentitysessionContextsessionIssueruserName \ - --timestamp_column_name=event_dt \ - from-cloudtrail --input_glob=/common/models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --max_files=200 \ - train-ae --train_data_glob=/common/models/datasets/training-data/dfp-cloudtrail-*.csv \ - --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \ - --seed 42 \ - preprocess \ - inf-pytorch \ - add-scores \ - timeseries --resolution=1m --zscore_threshold=8.0 --hot_start \ - monitor --description 'Inference Rate' --smoothing=0.001 --unit inf \ - serialize \ - to-file --filename=/common/data//cloudtrail-dfp-detections.csv --overwrite" \ - --namespace $NAMESPACE \ - \ - morpheus-sdk-client -``` - -For more information on the Digital Fingerprint use cases, refer to the starter example and a more production-ready example that can be found in the `examples` source directory. - ### Run NLP Phishing Detection Pipeline The following Phishing Detection pipeline examples use a pre-trained NLP model to analyze emails (body) and determine phishing or benign. Here is the sample data as shown below is used to pass as an input to the pipeline. 
diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md index 38439e9d43..eee39eb9f4 100644 --- a/docs/source/developer_guide/contributing.md +++ b/docs/source/developer_guide/contributing.md @@ -347,7 +347,7 @@ Launching a full production Kafka cluster is outside the scope of this project; ### Pipeline Validation -To verify that all pipelines are working correctly, validation scripts have been added at `${MORPHEUS_ROOT}/scripts/validation`. There are scripts for each of the main workflows: Anomalous Behavior Profiling (ABP), Humans-as-Machines-Machines-as-Humans (HAMMAH), Phishing Detection (Phishing), and Sensitive Information Detection (SID). +To verify that all pipelines are working correctly, validation scripts have been added at `${MORPHEUS_ROOT}/scripts/validation`. There are scripts for each of the main workflows: Anomalous Behavior Profiling (ABP), Phishing Detection (Phishing), and Sensitive Information Detection (SID). To run all of the validation workflow scripts, use the following commands: diff --git a/docs/source/developer_guide/guides/5_digital_fingerprinting.md b/docs/source/developer_guide/guides/5_digital_fingerprinting.md index e64b8a91d4..7ada67d7c8 100644 --- a/docs/source/developer_guide/guides/5_digital_fingerprinting.md +++ b/docs/source/developer_guide/guides/5_digital_fingerprinting.md @@ -23,7 +23,7 @@ Every account, user, service, and machine has a digital fingerprint that represe To construct this digital fingerprint, we will be training unsupervised behavioral models at various granularities, including a generic model for all users in the organization along with fine-grained models for each user to monitor their behavior. These models are continuously updated and retrained over time​, and alerts are triggered when deviations from normality occur for any user​. 
## Training Sources -The data we will want to use for the training and inference will be any sensitive system that the user interacts with, such as VPN, authentication and cloud services. The digital fingerprinting example (`examples/digital_fingerprinting/README.md`) included in Morpheus ingests logs from [AWS CloudTrail](https://docs.aws.amazon.com/cloudtrail/index.html), [Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/reports-monitoring/concept-sign-ins), and [Duo Authentication](https://duo.com/docs/adminapi). +The data we will want to use for the training and inference will be any sensitive system that the user interacts with, such as VPN, authentication and cloud services. The digital fingerprinting example (`examples/digital_fingerprinting/README.md`) included in Morpheus ingests logs from [Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/reports-monitoring/concept-sign-ins), and [Duo Authentication](https://duo.com/docs/adminapi). The location of these logs could be either local to the machine running Morpheus, a shared file system like NFS, or on a remote store such as [Amazon S3](https://aws.amazon.com/s3/). @@ -44,27 +44,13 @@ Adding a new source for the DFP pipeline requires defining five critical pieces: 1. A [`DataFrameInputSchema`](6_digital_fingerprinting_reference.md#dataframe-input-schema-dataframeinputschema) for the [`DFPFileToDataFrameStage`](6_digital_fingerprinting_reference.md#file-to-dataframe-stage-dfpfiletodataframestage) stage. 1. A [`DataFrameInputSchema`](6_digital_fingerprinting_reference.md#dataframe-input-schema-dataframeinputschema) for the [`DFPPreprocessingStage`](6_digital_fingerprinting_reference.md#preprocessing-stage-dfppreprocessingstage). -## DFP Examples -The DFP workflow is provided as two separate examples: a simple, "starter" pipeline for new users and a complex, "production" pipeline for full scale deployments. 
While these two examples both perform the same general tasks, they do so in very different ways. The following is a breakdown of the differences between the two examples. - -### The "Starter" Example - -This example is designed to simplify the number of stages and components and provide a fully contained workflow in a single pipeline. - -Key Differences: - * A single pipeline which performs both training and inference - * Requires no external services - * Can be run from the Morpheus CLI - -This example is described in more detail in `examples/digital_fingerprinting/starter/README.md`. - -### The "Production" Example +## Production Deployment Example This example is designed to illustrate a full-scale, production-ready, DFP deployment in Morpheus. It contains all of the necessary components (such as a model store), to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company. -Key Differences: +Key Features: * Multiple pipelines are specialized to perform either training or inference - * Requires setting up a model store to allow the training and inference pipelines to communicate + * Uses a model store to allow the training and inference pipelines to communicate * Organized into a docker-compose deployment for easy startup * Contains a Jupyter notebook service to ease development and debugging * Can be deployed to Kubernetes using provided Helm charts @@ -72,26 +58,9 @@ Key Differences: This example is described in `examples/digital_fingerprinting/production/README.md` as well as the rest of this document. 
-### DFP Features +## DFP Features -#### AWS CloudTrail -| Feature | Description | -| ------- | ----------- | -| `userIdentityaccessKeyId` | for example, `ACPOSBUM5JG5BOW7B2TR`, `ABTHWOIIC0L5POZJM2FF`, `AYI2CM8JC3NCFM4VMMB4` | -| `userAgent` | for example, `Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; Trident/5.1)`, `Mozilla/5.0 (Linux; Android 4.3.1) AppleWebKit/536.1 (KHTML, like Gecko) Chrome/62.0.822.0 Safari/536.1`, `Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10 7_0; rv:1.9.4.20) Gecko/2012-06-10 12:09:43 Firefox/3.8` | -| `userIdentitysessionContextsessionIssueruserName` | for example, `role-g` | -| `sourceIPAddress` | for example, `208.49.113.40`, `123.79.131.26`, `128.170.173.123` | -| `userIdentityaccountId` | for example, `Account-123456789` | -| `errorMessage` | for example, `The input fails to satisfy the constraints specified by an AWS service.`, `The specified subnet cannot be found in the VPN with which the Client VPN endpoint is associated.`, `Your account is currently blocked. 
Contact aws-verification@amazon.com if you have questions.` | -| `userIdentitytype` | for example, `FederatedUser` | -| `eventName` | for example, `GetSendQuota`, `ListTagsForResource`, `DescribeManagedPrefixLists` | -| `userIdentityprincipalId` | for example, `39c71b3a-ad54-4c28-916b-3da010b92564`, `0baf594e-28c1-46cf-b261-f60b4c4790d1`, `7f8a985f-df3b-4c5c-92c0-e8bffd68abbf` | -| `errorCode` | for example, success, `MissingAction`, `ValidationError` | -| `eventSource` | for example, `lopez-byrd.info`, `robinson.com`, `lin.com` | -| `userIdentityarn` | for example, `arn:aws:4a40df8e-c56a-4e6c-acff-f24eebbc4512`, `arn:aws:573fd2d9-4345-487a-9673-87de888e4e10`, `arn:aws:c8c23266-13bb-4d89-bce9-a6eef8989214` | -| `apiVersion` | for example, `1984-11-26`, `1990-05-27`, `2001-06-09` | - -#### Azure Active Directory +### Azure Active Directory | Feature | Description | | ------- | ----------- | | `appDisplayName` | for example, `Windows sign in`, `MS Teams`, `Office 365`​ | @@ -104,14 +73,14 @@ This example is described in `examples/digital_fingerprinting/production/README. | `location.countryOrRegion` | country or region name​ | | `location.city` | city name | -##### Derived Features +#### Derived Features | Feature | Description | | ------- | ----------- | | `logcount` | tracks the number of logs generated by a user within that day (increments with every log)​ | | `locincrement` | increments every time we observe a new city (`location.city`) in a user's logs within that day​ | | `appincrement` | increments every time we observe a new app (`appDisplayName`) in a user's logs within that day​ | -#### Duo Authentication +### Duo Authentication | Feature | Description | | ------- | ----------- | | `auth_device.name` | phone number​ | @@ -121,7 +90,7 @@ This example is described in `examples/digital_fingerprinting/production/README. 
| `reason` | reason for the results, for example, `User Cancelled`, `User Approved`, `User Mistake`, `No Response`​ | | `access_device.location.city` | city name | -##### Derived Features +#### Derived Features | Feature | Description | | ------- | ----------- | | `logcount` | tracks the number of logs generated by a user within that day (increments with every log)​ | @@ -133,16 +102,16 @@ DFP in Morpheus is accomplished via two independent pipelines: training and infe ![High Level Architecture](img/dfp_high_level_arch.png) -#### Training Pipeline +### Training Pipeline * Trains user models and uploads to the model store​ * Capable of training individual user models or a fallback generic model for all users​ -#### Inference Pipeline +### Inference Pipeline * Downloads user models from the model store​ * Generates anomaly scores per log​ * Sends detected anomalies to monitoring services -#### Monitoring +### Monitoring * Detected anomalies are published to an S3 bucket, directory or a Kafka topic. * Output can be integrated with a monitoring tool. diff --git a/docs/source/getting_started.md b/docs/source/getting_started.md index b4d2b04cab..0b5a434875 100644 --- a/docs/source/getting_started.md +++ b/docs/source/getting_started.md @@ -368,36 +368,6 @@ Commands: trigger Buffer data until the previous stage has completed. validate Validate pipeline output for testing. ``` - -And for the AE pipeline: - -``` -$ morpheus run pipeline-ae --help -Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 [ARGS]...]... - - - -Commands: - add-class Add detected classifications to each message. - add-scores Add probability scores to each message. - buffer (Deprecated) Buffer results. - delay (Deprecated) Delay results for a certain duration. - filter Filter message by a classification threshold. - from-azure Source stage is used to load Azure Active Directory messages. - from-cloudtrail Load messages from a CloudTrail directory. 
- from-duo Source stage is used to load Duo Authentication messages. - inf-pytorch Perform inference with PyTorch. - inf-triton Perform inference with Triton Inference Server. - monitor Display throughput numbers at a specific point in the pipeline. - preprocess Prepare Autoencoder input DataFrames for inference. - serialize Includes & excludes columns from messages. - timeseries Perform time series anomaly detection and add prediction. - to-file Write all messages to a file. - to-kafka Write all messages to a Kafka cluster. - train-ae Train an Autoencoder model on incoming data. - trigger Buffer data until the previous stage has completed. - validate Validate pipeline output for testing. -``` Note: The available commands for different types of pipelines are not the same. This means that the same stage, when used in different pipelines, may have different options. Check the CLI help for the most up-to-date information during development. ## Next Steps diff --git a/docs/source/stages/morpheus_stages.md b/docs/source/stages/morpheus_stages.md index db2d533606..f574271a08 100644 --- a/docs/source/stages/morpheus_stages.md +++ b/docs/source/stages/morpheus_stages.md @@ -44,19 +44,15 @@ Stages are the building blocks of Morpheus pipelines. Below is a list of the mos ## Inference -- Auto Encoder Inference Stage {py:class}`~morpheus.stages.inference.auto_encoder_inference_stage.AutoEncoderInferenceStage` PyTorch inference stage used for Auto Encoder pipeline mode. - PyTorch Inference Stage {py:class}`~morpheus.stages.inference.pytorch_inference_stage.PyTorchInferenceStage` PyTorch inference stage used for most pipeline modes with the exception of Auto Encoder. - Triton Inference Stage {py:class}`~morpheus.stages.inference.triton_inference_stage.TritonInferenceStage` Inference stage which utilizes a [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server). 
## Input - App Shield Source Stage {py:class}`~morpheus.stages.input.appshield_source_stage.AppShieldSourceStage` Load App Shield messages from one or more plugins into a DataFrame. -- Azure Source Stage {py:class}`~morpheus.stages.input.azure_source_stage.AzureSourceStage` Load Azure Active Directory messages. -- Cloud Trail Source Stage {py:class}`~morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage` Load messages from a CloudTrail directory. - Control Message File Source Stage {py:class}`~morpheus.stages.input.control_message_file_source_stage.ControlMessageFileSourceStage` Receives control messages from different sources specified by a list of (fsspec)[https://filesystem-spec.readthedocs.io/en/latest/api.html?highlight=open_files#fsspec.open_files] strings. - Control Message Kafka Source Stage {py:class}`~morpheus.stages.input.control_message_kafka_source_stage.ControlMessageKafkaSourceStage` Load control messages from a Kafka cluster. - Databricks Delta Lake Source Stage {py:class}`~morpheus.stages.input.databricks_deltalake_source_stage.DataBricksDeltaLakeSourceStage` Source stage used to load messages from a DeltaLake table. -- Duo Source Stage {py:class}`~morpheus.stages.input.duo_source_stage.DuoSourceStage` Load Duo Authentication messages. - File Source Stage {py:class}`~morpheus.stages.input.file_source_stage.FileSourceStage` Load messages from a file. - HTTP Client Source Stage {py:class}`~morpheus.stages.input.http_client_source_stage.HttpClientSourceStage` Poll a remote HTTP server for incoming data. - HTTP Server Source Stage {py:class}`~morpheus.stages.input.http_server_source_stage.HttpServerSourceStage` Start an HTTP server and listens for incoming requests on a specified endpoint. @@ -92,7 +88,5 @@ Stages are the building blocks of Morpheus pipelines. 
Below is a list of the mos - Deserialize Stage {py:class}`~morpheus.stages.preprocess.deserialize_stage.DeserializeStage` Partition messages based on the `pipeline_batch_size` parameter of the pipeline's `morpheus.config.Config` object. - Drop Null Stage {py:class}`~morpheus.stages.preprocess.drop_null_stage.DropNullStage` Drop null data entries from a DataFrame. -- Preprocess AE Stage {py:class}`~morpheus.stages.preprocess.preprocess_ae_stage.PreprocessAEStage` Prepare Autoencoder input DataFrames for inference. - Preprocess FIL Stage {py:class}`~morpheus.stages.preprocess.preprocess_fil_stage.PreprocessFILStage` Prepare FIL input DataFrames for inference. - Preprocess NLP Stage {py:class}`~morpheus.stages.preprocess.preprocess_nlp_stage.PreprocessNLPStage` Prepare NLP input DataFrames for inference. -- Train AE Stage {py:class}`~morpheus.stages.preprocess.train_ae_stage.TrainAEStage` Train an Autoencoder model on incoming data. diff --git a/examples/digital_fingerprinting/README.md b/examples/digital_fingerprinting/README.md deleted file mode 100644 index d9296eb5f6..0000000000 --- a/examples/digital_fingerprinting/README.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# Digital Fingerprinting (DFP) in Morpheus - -## Organization - -The DFP example workflows in Morpheus are designed to scale up to company wide workloads and handle several different log types which resulted in a large number of moving parts to handle the various services and configuration options. To simplify things, the DFP workflow is provided as two separate examples: a simple, "starter" pipeline for new users and a complex, "production" pipeline for full scale deployments. While these two examples both perform the same general tasks, they do so in very different ways. The following is a breakdown of the differences between the two examples. 
- -### The "Starter" Example - -This example is designed to simplify the number of stages and components and provided a fully contained workflow in a single pipeline. - -Key Differences: - * A single pipeline which performs both training and inference - * Requires no external services - * Can be run from the Morpheus CLI - - -### The "Production" Example - -This example is designed to illustrate a full-scale, production-ready, DFP deployment in Morpheus. It contains all of the necessary components (such as a model store), to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company. - -Key Differences: - * Multiple pipelines are specialized to perform either training or inference - * Requires setting up a model store to allow the training and inference pipelines to communicate - * Organized into a `docker compose` deployment for easy startup - * Contains a Jupyter notebook service to ease development and debugging - * Can be deployed to Kubernetes using provided Helm charts - * Uses many customized stages to maximize performance. - -## Getting Started - -Guides for each of the two examples can be found in their respective directories: [The Starter Example](./starter/README.md) and [The Production Example](./production/README.md) diff --git a/examples/digital_fingerprinting/production/README.md b/examples/digital_fingerprinting/production/README.md index 289634790e..69668db317 100644 --- a/examples/digital_fingerprinting/production/README.md +++ b/examples/digital_fingerprinting/production/README.md @@ -19,9 +19,9 @@ limitations under the License. This example is designed to illustrate a full-scale, production-ready, DFP deployment in Morpheus. It contains all of the necessary components (such as a model store), to allow multiple Morpheus pipelines to communicate at a scale that can handle the workload of an entire company. 
-Key Differences: +Key Features: * Multiple pipelines are specialized to perform either training or inference - * Requires setting up a model store to allow the training and inference pipelines to communicate + * Uses a model store to allow the training and inference pipelines to communicate * Organized into a `docker compose` deployment for easy startup * Contains a Jupyter notebook service to ease development and debugging * Can be deployed to Kubernetes using provided Helm charts diff --git a/examples/digital_fingerprinting/starter/README.md b/examples/digital_fingerprinting/starter/README.md deleted file mode 100644 index 89c2e60a66..0000000000 --- a/examples/digital_fingerprinting/starter/README.md +++ /dev/null @@ -1,304 +0,0 @@ - - -> **Warning**: This example is currently broken and fails with a Segmentation fault [#1641](https://github.com/nv-morpheus/Morpheus/issues/1641) - -# "Starter" Digital Fingerprinting Pipeline - -We show here how to set up and run the DFP pipeline for three log types: CloudTrail, Duo, and Azure. Each of these log types uses a built-in source stage that handles that specific data format. New source stages can be added to allow the DFP pipeline to process different log types. All stages after the source stages are identical across all log types but can be configured differently via pipeline or stage configuration options. - -## Environment Setup - -Follow the instructions [here](../../../docs/source/developer_guide/contributing.md) to set up your development environment in either a Docker container or Conda environment. - -## Morpheus CLI - -DFP pipelines can be constructed and run using the Morpheus CLI command `morpheus run pipeline-ae ...` - -Use `--help` to display information about the autoencoder pipeline command line options: - -``` -morpheus run pipeline-ae --help - -Usage: morpheus run pipeline-ae [OPTIONS] COMMAND1 [ARGS]... [COMMAND2 - [ARGS]...]... - - Configure and run the pipeline. 
To configure the pipeline, list the stages - in the order that data should flow. The output of each stage will become the - input for the next stage. For example, to read, classify and write to a - file, the following stages could be used - - pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model - --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json - - Pipelines must follow a few rules: - 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` - 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline - 3. Only one inference stage can be used. Zero is also fine - 4. The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` - -Options: - --columns_file FILE [default: ./morpheus/data/columns_ae_cloudtrail.txt] - --labels_file FILE Specifies a file to read labels from in - order to convert class IDs into labels. A - label file is a simple text file where each - line corresponds to a label. If unspecified, - only a single output label is created for - FIL - --userid_column_name TEXT Which column to use as the User ID. - [default: userIdentityaccountId; required] - --userid_filter TEXT Specifying this value will filter all - incoming data to only use rows with matching - User IDs. Which column is used for the User - ID is specified by `userid_column_name` - --feature_scaler TEXT Autoencoder feature scaler [default: - standard] - --use_generic_model BOOLEAN Whether to use a generic model when user does - not have minimum number of training rows - [default: False] - --viz_file FILE Save a visualization of the pipeline at the - specified location - --help Show this message and exit. 
- -Commands: - add-class Add detected classifications to each message - add-scores Add probability scores to each message - buffer (Deprecated) Buffer results - delay (Deprecated) Delay results for a certain duration - filter Filter message by a classification threshold - from-azure Source stage is used to load Azure Active Directory messages. - from-cloudtrail Load messages from a CloudTrail directory - from-duo Source stage is used to load Duo Authentication messages. - gen-viz (Deprecated) Write out visualization data frames - inf-pytorch Perform inference with PyTorch - inf-triton Perform inference with Triton - monitor Display throughput numbers at a specific point in the - pipeline - preprocess Convert messages to tokens - serialize Include & exclude columns from messages - timeseries Perform time series anomaly detection and add prediction. - to-file Write all messages to a file - to-kafka Write all messages to a Kafka cluster - train-ae Deserialize source data from JSON - validate Validates pipeline output against an expected output -``` -The commands above correspond to the Morpheus stages that can be used to construct your DFP pipeline. Options are available to configure pipeline and stages. 
-The following table shows mapping between the main Morpheus CLI commands and underlying Morpheus Python stage classes: - -| CLI Command | Stage Class | Python File | -| ------------------| ----------------------------| ----------------------------------------------------------- -| `from-azure` | `AzureSourceStage` | `morpheus/stages/input/azure_source_stage.py` -| `from-cloudtrail` | `CloudTrailSourceStage` | `morpheus/stages/input/clout_trail_source_stage.py` -| `from-duo` | `DuoSourceStage` | `morpheus/stages/input/duo_source_stage.py` -| `train-ae` | `TrainAEStage` | `morpheus/stages/preprocess/train_ae_stage.py` -| `preprocess` | `PreprocessAEStage` | `morpheus/stages/preprocess/preprocess_ae_stage.py` -| `inf-pytorch` | `AutoEncoderInferenceStage` | `morpheus/stages/inference/auto_encoder_inference_stage.py` -| `add-scores` | `AddScoresStage` | `morpheus/stages/postprocess/add_scores_stage.py` -| `serialize` | `SerializeStage` | `morpheus/stages/postprocess/serialize_stage.py` -| `to-file ` | `WriteToFileStage` | `morpheus/stages/output/write_to_file_stage.py` - - -## Morpheus DFP Stages - -**Source stages** - These include `AzureSourceStage`, `CloudTrailSourceStage` and `DuoSourceStage`. They are responsible for reading log files that match provided `--input_glob` (for example `/duo_logs/*.json`). Data is grouped by user so that each batch processed by the pipeline will only contain rows corresponding to a single user. Feature engineering also happens in this stage. All DFP source stages must extend `AutoencoderSourceStage` and implement the `files_to_dfs_per_user` abstract method. Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input data pass through to next stage. - -**Preprocessing stages** - -`TrainAEStage` can either train user models using data matching a provided `--train_data_glob` or load pre-trained models from file using `--pretrained_filename`. 
When using `--train_data_glob`, user models can be saved using the `--models_output_filename` option. The `--source_stage_class` must also be used with `--train_data_glob` so that the training stage knows how to read the training data. The autoencoder implementation used for user model training can be found [here](https://github.com/nv-morpheus/dfencoder). The following are the available CLI options for the `TrainAEStage` (train-ae): - -| Option | Description -| -------------------------| --------------------------------------------------------- -| `pretrained_filename` | File path to pickled user models saved from previous training run using `--models_output_filename`. -| `train_data_glob` | Glob path to training data. -| `source_stage_class` | Source stage so that training stage knows how to read/parse training data. -| `train_epochs` | Number of training epochs. Default is 25. -| `min_train_rows` | Minimum number of training rows required to train user model. Default is 300. -| `train_max_history` | Maximum number of training rows per user. Default is 1000. -| `seed` | When not None, ensure random number generators are seeded with `seed` to control reproducibility of user model. -| `sort_glob` | If true the list of files matching `input_glob` will be processed in sorted order. Default is False. -| `models_output_filename` | Can be used with `--train_data_glob` to save trained user models to file using provided file path. Models can be loaded later using `--pretrained_filename`. - -The `PreprocessAEStage` is responsible for creating a Morpheus message that contains everything needed by the inference stage. For DFP inference, this stage must pass a `ControlMessage` to the inference stage. Each message will correspond to a single user and include the input feature columns, the user's model and training data anomaly scores. 
- -**Inference stage** - `AutoEncoderInferenceStage` calculates anomaly scores (specifically, reconstruction loss) and z-scores for each user input dataset. - -**Post-processing stage** - The DFP pipeline uses the `AddScoresStage` for post-processing to add anomaly scores and z-scores from previous inference stage with matching labels. - -**Serialize stage** - `SerializeStage` is used to convert `ControlMessage` from previous stage to a `MessageMeta` to make it suitable for output (for example writing to file or Kafka). - -**Write stage** - `WriteToFileStage` writes input data with inference results to an output file path. - -## Download DFP Example Data from S3 - -``` -pip install s3fs -``` - -``` -./examples/digital_fingerprinting/fetch_example_data.py all -``` - -Azure training data will be saved to `examples/data/dfp/azure-training-data`, inference data to `examples/data/dfp/azure-inference-data`. -Duo training data will be saved to `examples/data/dfp/duo-training-data`, inference data to `examples/data/dfp/duo-inference-data`. 
- -## CloudTrail DFP Pipeline - -Run the following in your Morpheus container to start the CloudTrail DFP pipeline: - -``` -morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ - pipeline-ae \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --userid_column_name=userIdentitysessionContextsessionIssueruserName \ - --userid_filter=user123 \ - --feature_scaler=standard \ - from-cloudtrail \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --max_files=200 \ - train-ae \ - --train_data_glob=models/datasets/training-data/dfp-cloudtrail-*.csv \ - --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage \ - --seed=42 \ - preprocess \ - inf-pytorch \ - add-scores \ - serialize \ - to-file --filename=./cloudtrail-dfp-detections.csv --overwrite -``` - -## Duo DFP Pipeline - -The following pipeline trains user models from downloaded training data and saves user models to file. Pipeline then uses these models to run inference -on downloaded inference data. Inference results are written to `duo-detections.csv`. 
-``` -morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ - pipeline-ae \ - --columns_file=morpheus/data/columns_ae_duo.txt \ - --userid_column_name=username \ - --feature_scaler=standard \ - from-duo \ - --input_glob=examples/data/dfp/duo-inference-data/*.json \ - --max_files=200 \ - monitor --description='Input rate' \ - train-ae \ - --train_data_glob=examples/data/dfp/duo-training-data/*.json \ - --source_stage_class=morpheus.stages.input.duo_source_stage.DuoSourceStage \ - --seed=42 \ - --models_output_filename=models/dfp-models/duo_ae_user_models.pkl \ - preprocess \ - inf-pytorch \ - monitor --description='Inference rate' --unit inf \ - add-scores \ - serialize \ - to-file --filename=./duo-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/duo_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/duo`. Inference results are written to `duo-detections.csv`. -``` -morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ - pipeline-ae \ - --columns_file=morpheus/data/columns_ae_duo.txt \ - --userid_column_name=username \ - --feature_scaler=standard \ - from-duo \ - --input_glob=examples/data/dfp/duo-inference-data/*.json \ - --max_files=200 \ - monitor --description='Input rate' \ - train-ae \ - --pretrained_filename=models/dfp-models/duo_ae_user_models.pkl \ - preprocess \ - inf-pytorch \ - monitor --description='Inference rate' --unit inf \ - add-scores \ - serialize \ - to-file --filename=./duo-detections.csv --overwrite -``` - -## Azure DFP Pipeline - -The following pipeline trains user models from downloaded training data and saves user models to file. Pipeline then uses these models to run inference -on downloaded inference data. 
Inference results are written to `azure-detections.csv`. -``` -morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ - pipeline-ae \ - --columns_file=morpheus/data/columns_ae_azure.txt \ - --userid_column_name=userPrincipalName \ - --feature_scaler=standard \ - from-azure \ - --input_glob=examples/data/dfp/azure-inference-data/*.json \ - --max_files=200 \ - train-ae \ - --train_data_glob=examples/data/dfp/azure-training-data/*.json \ - --source_stage_class=morpheus.stages.input.azure_source_stage.AzureSourceStage \ - --seed=42 \ - --models_output_filename=models/dfp-models/azure_ae_user_models.pkl \ - preprocess \ - inf-pytorch \ - monitor --description='Inference rate' --unit inf \ - add-scores \ - serialize \ - to-file --filename=./azure-detections.csv --overwrite -``` - -The following example shows how we can load pre-trained user models from the file (`models/dfp-models/azure_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/azure`. Inference results are written to `azure-detections.csv`. -``` -morpheus --log_level=DEBUG \ - run --num_threads=1 --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=False \ - pipeline-ae \ - --columns_file=morpheus/data/columns_ae_azure.txt \ - --userid_column_name=userPrincipalName \ - --feature_scaler=standard \ - from-azure \ - --input_glob=examples/data/dfp/azure-inference-data/*.json \ - --max_files=200 \ - train-ae \ - --pretrained_filename=models/dfp-models/azure_ae_user_models.pkl \ - preprocess \ - inf-pytorch \ - monitor --description='Inference rate' --unit inf \ - add-scores \ - serialize \ - to-file --filename=./azure-detections.csv --overwrite -``` - - -## Using Morpheus Python API - -The DFP pipelines can also be constructed and run via the Morpheus Python API. 
An [example](./run_cloudtrail_dfp.py) is included for the CloudTrail DFP pipeline. The following are some commands to -run the example. - -Train user models from files in `models/datasets/training-data/dfp-cloudtrail-*.csv` and saves user models to file. Pipeline then uses these models to run inference on CloudTrail validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. -``` -python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --train_data_glob=models/datasets/training-data/dfp-*.csv \ - --models_output_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file ./cloudtrail-dfp-results.csv -``` - -Here we load pre-trained user models from the file (`models/dfp-models/cloudtrail_ae_user_models.pkl`) we created in the previous example. Pipeline then uses these models to run inference on validation data in `models/datasets/validation-data/dfp-cloudtrail-*-input.csv`. Inference results are written to `cloudtrail-dfp-results.csv`. -``` -python ./examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py \ - --columns_file=morpheus/data/columns_ae_cloudtrail.txt \ - --input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv \ - --pretrained_filename=models/dfp-models/cloudtrail_ae_user_models.pkl \ - --output_file=./cloudtrail-dfp-results.csv -``` diff --git a/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py b/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py deleted file mode 100644 index 835b3e2809..0000000000 --- a/examples/digital_fingerprinting/starter/run_cloudtrail_dfp.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Executes a pipeline that trains an autoencoder and then uses it to detect anomalies in the same data.""" - -import logging -import os - -import click - -from morpheus.config import AEFeatureScalar -from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig -from morpheus.config import PipelineModes -from morpheus.pipeline import LinearPipeline -from morpheus.stages.general.monitor_stage import MonitorStage -from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage -from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage -from morpheus.stages.output.write_to_file_stage import WriteToFileStage -from morpheus.stages.postprocess.add_scores_stage import AddScoresStage -from morpheus.stages.postprocess.serialize_stage import SerializeStage -from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage -from morpheus.stages.preprocess.train_ae_stage import TrainAEStage -from morpheus.utils.file_utils import load_labels_file -from morpheus.utils.logger import configure_logging - - -@click.command() -@click.option( - "--num_threads", - default=len(os.sched_getaffinity(0)), - type=click.IntRange(min=1), - help="Number of internal pipeline threads to use", -) -@click.option( - "--pipeline_batch_size", - default=1024, - type=click.IntRange(min=1), - help=("Internal batch size for the pipeline. 
Can be much larger than the model batch size. " - "Also used for Kafka consumers"), -) -@click.option( - "--model_max_batch_size", - default=1024, - type=click.IntRange(min=1), - help="Max batch size to use for the model", -) -@click.option( - "--columns_file", - type=click.Path(exists=True, readable=True), - required=True, - help="Feature columns file", -) -@click.option( - "--input_glob", - type=str, - required=True, - help="Inference input glob", -) -@click.option( - "--train_data_glob", - type=str, - required=False, - help="Train data glob", -) -@click.option( - "--pretrained_filename", - type=click.Path(exists=True, readable=True), - required=False, - help="File with pre-trained user models", -) -@click.option( - "--models_output_filename", - help="The path to the file where the inference output will be saved.", -) -@click.option( - "--output_file", - default="./cloudtrail-detections.csv", - help="The path to the file where the inference output will be saved.", -) -def run_pipeline(num_threads, - pipeline_batch_size, - model_max_batch_size, - columns_file, - input_glob, - train_data_glob, - pretrained_filename, - models_output_filename, - output_file): - """Configure and run the pipeline.""" - configure_logging(log_level=logging.DEBUG) - - CppConfig.set_should_use_cpp(False) - - config = Config() - config.mode = PipelineModes.AE - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.feature_scaler = AEFeatureScalar.STANDARD - config.ae.feature_columns = load_labels_file(columns_file) - config.num_threads = num_threads - config.pipeline_batch_size = pipeline_batch_size - config.model_max_batch_size = model_max_batch_size - config.class_labels = ["reconstruct_loss", "zscore"] - - # Create a pipeline object - pipeline = LinearPipeline(config) - - # Add a source stage - pipeline.set_source(CloudTrailSourceStage(config, input_glob=input_glob)) - - # Add a training stage - pipeline.add_stage( 
- TrainAEStage(config, - pretrained_filename=pretrained_filename, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - models_output_filename=models_output_filename, - seed=42, - sort_glob=True)) - - # Add a preprocessing stage - pipeline.add_stage(PreprocessAEStage(config)) - - # Add a inference stage - pipeline.add_stage(AutoEncoderInferenceStage(config)) - - # Add anomaly scores and z-scores to each message - pipeline.add_stage(AddScoresStage(config)) - - # Add serialize stage - pipeline.add_stage(SerializeStage(config)) - - # Add a write file stage - pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) - - pipeline.add_stage(MonitorStage(config, description="Postprocessing rate")) - - # Run the pipeline - pipeline.run() - - -if __name__ == "__main__": - # The click decordators add all of the needed arguments to the `run_pipeline` function but pylint doesn't know that - # pylint: disable=no-value-for-parameter - run_pipeline() diff --git a/morpheus.code-workspace b/morpheus.code-workspace index 0ec937642b..24f34efff7 100644 --- a/morpheus.code-workspace +++ b/morpheus.code-workspace @@ -140,68 +140,6 @@ "subProcess": true, "type": "debugpy" }, - { - "args": [ - "--log_level=DEBUG", - "run", - "--num_threads=1", - "--pipeline_batch_size=1024", - "--model_max_batch_size=1024", - "--use_cpp=False", - "pipeline-ae", - "--columns_file=python/morpheus/morpheus/data/columns_ae_cloudtrail.txt", - "--userid_column_name=userIdentitysessionContextsessionIssueruserName", - "--userid_filter=user123", - "--timestamp_column_name=event_dt", - "from-cloudtrail", - "--input_glob=models/datasets/validation-data/dfp-cloudtrail-*-input.csv", - "--max_files=200", - "train-ae", - "--train_data_glob=models/datasets/training-data/dfp-*.csv", - "--source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - "--seed=42", - "preprocess", - "inf-pytorch", 
- "add-scores", - "timeseries", - "--resolution=1m", - "--zscore_threshold=8.0", - "--hot_start", - "monitor", - "--description", - "Inference Rate", - "--unit", - "inf", - "validate", - "--val_file_name=models/datasets/validation-data/dfp-cloudtrail-user123-validation-data-output.csv", - "--results_file_name=./.tmp/validation_results-ae.json", - "--index_col=_index_", - "--exclude=event_dt", - "--rel_tol=0.1", - "--overwrite", - "serialize", - // "--include", - // "timestamp", - // "--exclude", - // "^_ts_", - // "--exclude", - // "^nvidia_smi_log", - // "to-kafka", - // "--output_topic", - // "inference_output", - "to-file", - "--filename=./.tmp/detections.csv", - "--overwrite" - ], - "console": "integratedTerminal", - "cwd": "${workspaceFolder}", - "justMyCode": false, - "name": "Python: Run Pipeline (AE)", - "program": "${workspaceFolder}/python/morpheus/morpheus/cli/run.py", - "request": "launch", - "subProcess": true, - "type": "debugpy" - }, { "args": [ "--log_level=DEBUG", @@ -493,97 +431,6 @@ }, "type": "cppdbg" }, - { - "MIMode": "gdb", - "args": [ - "./python/morpheus/morpheus/cli.py", - "--log_level=DEBUG", - "run", - "--num_threads=1", - "--pipeline_batch_size=128", - "--model_max_batch_size=128", - "--use_cpp=False", - "pipeline-ae", - // "--ae_path=models/hammah-models/hammah-role-g-20211017.pkl", - "--ae_path=../data/ae_model.pkl", - "from-cloudtrail", - "--input_glob=models/datasets/validation-data/dfp-cloudtrail-role-g-validation-data.csv", - // "--input_glob=./data/red_team.csv", - "--max_files=200", - "--iterative", - "deserialize", - "preprocess", - "inf-triton", - "--model_name=autoencoder-onnx", - "--server_url=localhost:8001", - "timeseries", - "--resolution=10m", - "monitor", - "--description", - "Inference Rate", - "--smoothing=0.001", - "--unit", - "inf" - // "add-class", - // "filter", - // "serialize", - // "--include", - // "timestamp", - // "--exclude", - // "^_ts_", - // "--exclude", - // "^nvidia_smi_log", - // "to-kafka", - // 
"--output_topic", - // "inference_output", - // "to-file", - // "--filename=./.tmp/detections.json", - // "--overwrite", - ], - "cwd": "${workspaceFolder}", - "environment": [ - { - "name": "MORPHEUS_ROOT", - "value": "${workspaceFolder}" - }, - { - "name": "GLOG_v", - "value": "10" - }, - { - "name": "CUDA_LAUNCH_BLOCKING", - "value": "1" - } - ], - "externalConsole": false, - "miDebuggerPath": "gdb", - "name": "Debug MRC from Python (Morpheus-AE)", - "program": "python", - "request": "launch", - "setupCommands": [ - { - "description": "Enable pretty-printing for gdb", - "ignoreFailures": true, - "text": "-enable-pretty-printing" - }, - { - "description": "Skip stdio-common files", - "text": "-interpreter-exec console \"skip -gfi **/bits/*.h\"" - } - ], - "sourceFileMap": { - "${workspaceFolder}": { - "editorPath": "${workspaceFolder}", - "useForBreakpoints": "true" - } - }, - "stopAtEntry": false, - "symbolLoadInfo": { - "exceptionList": "libmrc*.so;cudf_helpers.*;executor.*;morpheus.*;node.*;options.*;pipeline.*;segment.*;subscriber.*;stages.*;messages.*;common*.so", - "loadAll": false - }, - "type": "cppdbg" - }, { "MIMode": "gdb", "args": [ diff --git a/python/morpheus/morpheus/cli/commands.py b/python/morpheus/morpheus/cli/commands.py index e7df1d3b75..232da7dea0 100644 --- a/python/morpheus/morpheus/cli/commands.py +++ b/python/morpheus/morpheus/cli/commands.py @@ -25,16 +25,12 @@ from morpheus.cli.stage_registry import LazyStageInfo from morpheus.cli.utils import MorpheusRelativePath from morpheus.cli.utils import get_config_from_ctx -from morpheus.cli.utils import get_enum_keys from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_pipeline_from_ctx from morpheus.cli.utils import load_labels_file -from morpheus.cli.utils import parse_enum from morpheus.cli.utils import parse_log_level from morpheus.cli.utils import prepare_command -from morpheus.config import AEFeatureScalar from morpheus.config import Config -from morpheus.config 
import ConfigAutoEncoder from morpheus.config import ConfigFIL from morpheus.config import ConfigOnnxToTRT from morpheus.config import CppConfig @@ -448,108 +444,6 @@ def pipeline_fil(ctx: click.Context, **kwargs): return p -@click.group(chain=True, - short_help="Run the inference pipeline with an AutoEncoder model", - no_args_is_help=True, - cls=PluginGroup, - pipeline_mode=PipelineModes.AE) -@click.option('--columns_file', - required=True, - default=None, - type=MorpheusRelativePath(dir_okay=False, exists=True, file_okay=True, resolve_path=True), - help=("Specifies a file to read column features.")) -@click.option('--labels_file', - default=None, - type=MorpheusRelativePath(dir_okay=False, exists=True, file_okay=True, resolve_path=True), - help=("Specifies a file to read labels from in order to convert class IDs into labels. " - "A label file is a simple text file where each line corresponds to a label. ")) -@click.option('--userid_column_name', - type=str, - default="userIdentityaccountId", - required=True, - help=("Which column to use as the User ID.")) -@click.option('--userid_filter', - type=str, - default=None, - help=("Specifying this value will filter all incoming data to only use rows with matching User IDs. 
" - "Which column is used for the User ID is specified by `userid_column_name`")) -@click.option('--feature_scaler', - type=click.Choice(get_enum_keys(AEFeatureScalar), case_sensitive=False), - default=AEFeatureScalar.STANDARD.name, - callback=functools.partial(parse_enum, enum_class=AEFeatureScalar, case_sensitive=False), - help=("Autoencoder feature scaler")) -@click.option('--use_generic_model', - is_flag=True, - type=bool, - help=("Whether to use a generic model when user does not have minimum number of training rows")) -@click.option('--viz_file', - default=None, - type=click.Path(dir_okay=False, writable=True), - help="Save a visualization of the pipeline at the specified location") -@click.option('--viz_direction', - default="LR", - type=click.Choice(RANKDIR_CHOICES, case_sensitive=False), - help=("Set the direction for the Graphviz pipeline diagram, " - "ignored unless --viz_file is also specified.")) -@click.option('--timestamp_column_name', - type=str, - default="timestamp", - required=True, - help=("Which column to use as the timestamp.")) -@prepare_command() -def pipeline_ae(ctx: click.Context, **kwargs): - """ - Configure and run the pipeline. To configure the pipeline, list the stages in the order that data should flow. The - output of each stage will become the input for the next stage. For example, to read, classify and write to a file, - the following stages could be used - - pipeline from-file --filename=my_dataset.json deserialize preprocess inf-triton --model_name=my_model - --server_url=localhost:8001 filter --threshold=0.5 to-file --filename=classifications.json - - Pipelines must follow a few rules: - 1. Data must originate in a source stage. Current options are `from-file` or `from-kafka` - 2. A `deserialize` stage must be placed between the source stages and the rest of the pipeline - 3. Only one inference stage can be used. Zero is also fine - 4. 
The following stages must come after an inference stage: `add-class`, `filter`, `gen-viz` - - """ - - click.secho("Configuring Pipeline via CLI", fg="green") - - config = get_config_from_ctx(ctx) - config.mode = PipelineModes.AE - - if CppConfig.get_should_use_cpp(): - logger.warning("C++ is disabled for AutoEncoder pipelines at this time.") - CppConfig.set_should_use_cpp(False) - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = kwargs["userid_column_name"] - config.ae.timestamp_column_name = kwargs["timestamp_column_name"] - config.ae.feature_scaler = kwargs["feature_scaler"] - config.ae.use_generic_model = kwargs["use_generic_model"] - config.ae.feature_columns = load_labels_file(kwargs["columns_file"]) - logger.debug("Loaded columns. Current columns: [%s]", str(config.ae.feature_columns)) - - if ("labels_file" in kwargs and kwargs["labels_file"] is not None): - config.class_labels = load_labels_file(kwargs["labels_file"]) - logger.debug("Loaded labels file. Current labels: [%s]", str(config.class_labels)) - else: - # Use default labels - config.class_labels = ["reconstruct_loss", "zscore"] - - if ("userid_filter" in kwargs): - config.ae.userid_filter = kwargs["userid_filter"] - - logger.info("Filtering all users except ID: '%s'", str(config.ae.userid_filter)) - - from morpheus.pipeline import LinearPipeline - - p = ctx.obj["pipeline"] = LinearPipeline(config) - - return p - - @click.group(chain=True, short_help="Run a custom inference pipeline without a specific model type", no_args_is_help=True, @@ -622,7 +516,6 @@ def pipeline_other(ctx: click.Context, **kwargs): @pipeline_nlp.result_callback() @pipeline_fil.result_callback() -@pipeline_ae.result_callback() @pipeline_other.result_callback() @click.pass_context def post_pipeline(ctx: click.Context, *args, **kwargs): @@ -647,12 +540,9 @@ def post_pipeline(ctx: click.Context, *args, **kwargs): # Manually create the subcommands for each command (necessary since commands can be used on multiple 
groups) run.add_command(pipeline_nlp) run.add_command(pipeline_fil) -run.add_command(pipeline_ae) run.add_command(pipeline_other) -ALL = (PipelineModes.AE, PipelineModes.NLP, PipelineModes.FIL, PipelineModes.OTHER) -NOT_AE = (PipelineModes.NLP, PipelineModes.FIL, PipelineModes.OTHER) -AE_ONLY = (PipelineModes.AE, ) +ALL = (PipelineModes.NLP, PipelineModes.FIL, PipelineModes.OTHER) FIL_ONLY = (PipelineModes.FIL, ) NLP_ONLY = (PipelineModes.NLP, ) @@ -661,37 +551,28 @@ def post_pipeline(ctx: click.Context, *args, **kwargs): add_command("add-scores", "morpheus.stages.postprocess.add_scores_stage.AddScoresStage", modes=ALL) add_command("buffer", "morpheus.stages.general.buffer_stage.BufferStage", modes=ALL) add_command("delay", "morpheus.stages.general.delay_stage.DelayStage", modes=ALL) -add_command("deserialize", "morpheus.stages.preprocess.deserialize_stage.DeserializeStage", modes=NOT_AE) -add_command("dropna", "morpheus.stages.preprocess.drop_null_stage.DropNullStage", modes=NOT_AE) +add_command("deserialize", "morpheus.stages.preprocess.deserialize_stage.DeserializeStage", modes=ALL) +add_command("dropna", "morpheus.stages.preprocess.drop_null_stage.DropNullStage", modes=ALL) add_command("filter", "morpheus.stages.postprocess.filter_detections_stage.FilterDetectionsStage", modes=ALL) add_command("from-arxiv", "morpheus.stages.input.arxiv_source.ArxivSource", modes=ALL) -add_command("from-azure", "morpheus.stages.input.azure_source_stage.AzureSourceStage", modes=AE_ONLY) add_command("from-appshield", "morpheus.stages.input.appshield_source_stage.AppShieldSourceStage", modes=FIL_ONLY) -add_command("from-azure", "morpheus.stages.input.azure_source_stage.AzureSourceStage", modes=AE_ONLY) -add_command("from-cloudtrail", "morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", modes=AE_ONLY) add_command("from-databricks-deltalake", "morpheus.stages.input.databricks_deltalake_source_stage.DataBricksDeltaLakeSourceStage", modes=ALL) 
-add_command("from-duo", "morpheus.stages.input.duo_source_stage.DuoSourceStage", modes=AE_ONLY) -add_command("from-file", "morpheus.stages.input.file_source_stage.FileSourceStage", modes=NOT_AE) -add_command("from-kafka", "morpheus.stages.input.kafka_source_stage.KafkaSourceStage", modes=NOT_AE) +add_command("from-file", "morpheus.stages.input.file_source_stage.FileSourceStage", modes=ALL) +add_command("from-kafka", "morpheus.stages.input.kafka_source_stage.KafkaSourceStage", modes=ALL) add_command("from-http", "morpheus.stages.input.http_server_source_stage.HttpServerSourceStage", modes=ALL) add_command("from-http-client", "morpheus.stages.input.http_client_source_stage.HttpClientSourceStage", modes=ALL) add_command("from-rss", "morpheus.stages.input.rss_source_stage.RSSSourceStage", modes=ALL) add_command("gen-viz", "morpheus.stages.postprocess.generate_viz_frames_stage.GenerateVizFramesStage", modes=NLP_ONLY) -add_command("inf-identity", "morpheus.stages.inference.identity_inference_stage.IdentityInferenceStage", modes=NOT_AE) -add_command("inf-pytorch", - "morpheus.stages.inference.auto_encoder_inference_stage.AutoEncoderInferenceStage", - modes=AE_ONLY) -add_command("inf-pytorch", "morpheus.stages.inference.pytorch_inference_stage.PyTorchInferenceStage", modes=NOT_AE) -add_command("inf-triton", "morpheus.stages.inference.triton_inference_stage.TritonInferenceStage", modes=NOT_AE) -add_command("mlflow-drift", "morpheus.stages.postprocess.ml_flow_drift_stage.MLFlowDriftStage", modes=NOT_AE) +add_command("inf-identity", "morpheus.stages.inference.identity_inference_stage.IdentityInferenceStage", modes=ALL) +add_command("inf-pytorch", "morpheus.stages.inference.pytorch_inference_stage.PyTorchInferenceStage", modes=ALL) +add_command("inf-triton", "morpheus.stages.inference.triton_inference_stage.TritonInferenceStage", modes=ALL) +add_command("mlflow-drift", "morpheus.stages.postprocess.ml_flow_drift_stage.MLFlowDriftStage", modes=ALL) add_command("monitor", 
"morpheus.stages.general.monitor_stage.MonitorStage", modes=ALL) -add_command("preprocess", "morpheus.stages.preprocess.preprocess_ae_stage.PreprocessAEStage", modes=AE_ONLY) add_command("preprocess", "morpheus.stages.preprocess.preprocess_fil_stage.PreprocessFILStage", modes=FIL_ONLY) add_command("preprocess", "morpheus.stages.preprocess.preprocess_nlp_stage.PreprocessNLPStage", modes=NLP_ONLY) add_command("serialize", "morpheus.stages.postprocess.serialize_stage.SerializeStage", modes=ALL) -add_command("timeseries", "morpheus.stages.postprocess.timeseries_stage.TimeSeriesStage", modes=AE_ONLY) add_command("to-elasticsearch", "morpheus.stages.output.write_to_elasticsearch_stage.WriteToElasticsearchStage", modes=ALL) @@ -699,7 +580,6 @@ def post_pipeline(ctx: click.Context, *args, **kwargs): add_command("to-kafka", "morpheus.stages.output.write_to_kafka_stage.WriteToKafkaStage", modes=ALL) add_command("to-http", "morpheus.stages.output.http_client_sink_stage.HttpClientSinkStage", modes=ALL) add_command("to-http-server", "morpheus.stages.output.http_server_sink_stage.HttpServerSinkStage", modes=ALL) -add_command("train-ae", "morpheus.stages.preprocess.train_ae_stage.TrainAEStage", modes=AE_ONLY) add_command("trigger", "morpheus.stages.general.trigger_stage.TriggerStage", modes=ALL) add_command("validate", "morpheus.stages.postprocess.validation_stage.ValidationStage", modes=ALL) diff --git a/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py b/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py deleted file mode 100644 index 32f3c569ad..0000000000 --- a/python/morpheus/morpheus/stages/inference/auto_encoder_inference_stage.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import typing - -import cupy as cp -import numpy as np -import pandas as pd - -import morpheus._lib.messages as _messages -from morpheus.cli.register_stage import register_stage -from morpheus.config import Config -from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage -from morpheus.messages import ResponseMemoryAE -from morpheus.messages import TensorMemory -from morpheus.stages.inference.inference_stage import InferenceStage -from morpheus.stages.inference.inference_stage import InferenceWorker -from morpheus.utils.producer_consumer_queue import ProducerConsumerQueue - - -class _AutoEncoderInferenceWorker(InferenceWorker): - - def __init__(self, inf_queue: ProducerConsumerQueue, c: Config): - super().__init__(inf_queue) - - self._max_batch_size = c.model_max_batch_size - self._seq_length = c.feature_length - - self._feature_columns = c.ae.feature_columns - - def init(self): - - pass - - def build_output_message(self, msg: ControlMessage) -> ControlMessage: - """ - Create initial inference response message with result values initialized to zero. Results will be - set in message as each inference batch is processed. - - Parameters - ---------- - msg : `morpheus.messages.ControlMessage` - Batch of ControlMessage. - - Returns - ------- - `morpheus.messages.ControlMessage` - Response ControlMessage. 
- """ - - dims = self.calc_output_dims(msg) - output_dims = (msg.payload().count, *dims[1:]) - - output_message = ControlMessage(msg) - output_message.payload(msg.payload()) - output_message.tensors(_messages.TensorMemory(count=output_dims[0], tensors={"probs": cp.zeros(output_dims)})) - - return output_message - - def calc_output_dims(self, msg: ControlMessage) -> typing.Tuple: - # reconstruction loss and zscore - return (msg.tensors().count, 2) - - def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory], None]): - """ - This function processes inference batch by using batch's model to calculate anomaly scores - and adding results to response. - - Parameters - ---------- - batch : `morpheus.messages.ControlMessage` - Batch of inference messages. - callback : typing.Callable[[`morpheus.pipeline.messages.TensorMemory`], None] - Inference callback. - - """ - - data = batch.payload().get_data(batch.payload().df.columns.intersection(self._feature_columns)) - - explain_cols = [x + "_z_loss" for x in self._feature_columns] + ["max_abs_z", "mean_abs_z"] - explain_df = pd.DataFrame(np.empty((batch.tensors().count, (len(self._feature_columns) + 2)), dtype=object), - columns=explain_cols) - - model = batch.get_metadata("model") - if model is not None: - rloss_scores = model.get_anomaly_score(data) - - results = model.get_results(data, return_abs=True) - scaled_z_scores = [col for col in results.columns if col.endswith('_z_loss')] - scaled_z_scores.extend(['max_abs_z', 'mean_abs_z']) - scaledz_df = results[scaled_z_scores] - for col in scaledz_df.columns: - explain_df[col] = scaledz_df[col] - - zscores = (rloss_scores - batch.get_metadata("train_scores_mean")) / batch.get_metadata("train_scores_std") - rloss_scores = rloss_scores.reshape((batch.tensors().count, 1)) - zscores = np.absolute(zscores) - zscores = zscores.reshape((batch.tensors().count, 1)) - else: - rloss_scores = np.empty((batch.tensors().count, 1)) - rloss_scores[:] = np.NaN - zscores 
= np.empty((batch.tensors().count, 1)) - zscores[:] = np.NaN - - ae_scores = np.concatenate((rloss_scores, zscores), axis=1) - - ae_scores = cp.asarray(ae_scores) - - mem = ResponseMemoryAE(count=batch.tensors().count, probs=ae_scores) - - mem.explain_df = explain_df - - callback(mem) - - -@register_stage("inf-pytorch", modes=[PipelineModes.AE]) -class AutoEncoderInferenceStage(InferenceStage): - """ - Perform inference with PyTorch. - """ - - def __init__(self, c: Config): - super().__init__(c) - - self._config = c - - def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWorker: - - return _AutoEncoderInferenceWorker(inf_queue, self._config) - - @staticmethod - def _convert_one_response(output: ControlMessage, inf: ControlMessage, res: ResponseMemoryAE): - # Set the explainability and then call the base - res.explain_df.index = range(0, inf.payload().count) - for col in res.explain_df.columns: - inf.payload().set_data(col, res.explain_df[col]) - - return InferenceStage._convert_one_response(output=output, inf=inf, res=res) diff --git a/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py b/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py deleted file mode 100644 index 6675b3eacd..0000000000 --- a/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py +++ /dev/null @@ -1,329 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import typing -from abc import abstractmethod -from functools import partial - -import mrc -import pandas as pd -from mrc.core import operators as ops - -from morpheus.common import FileTypes -from morpheus.config import Config -from morpheus.messages import UserMessageMeta -from morpheus.pipeline.preallocator_mixin import PreallocatorMixin -from morpheus.pipeline.single_output_source import SingleOutputSource -from morpheus.pipeline.stage_schema import StageSchema -from morpheus.utils.directory_watcher import DirectoryWatcher - - -class AutoencoderSourceStage(PreallocatorMixin, SingleOutputSource): - """ - All AutoEncoder source stages must extend this class and implement the `files_to_dfs_per_user` abstract method. - Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input - data pass through to next stage. - - Extend this class to load messages from a files and dump contents into a DFP pipeline immediately. Useful for - testing performance and accuracy of a pipeline. - - Parameters - ---------- - c : `morpheus.config.Config` - Pipeline configuration instance. - input_glob : str - Input glob pattern to match files to read. For example, `./input_dir/*.json` would read all files with the - 'json' extension in the directory input_dir. - watch_directory : bool, default = False - The watch directory option instructs this stage to not close down once all files have been read. Instead it will - read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional - files. Any new files that are added that match the glob will then be processed. - max_files: int, default = -1 - Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. - file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. - Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. 
- Supported extensions: 'json', 'csv' - repeat: int, default = 1 - How many times to repeat the dataset. Useful for extending small datasets in debugging. - sort_glob : bool, default = False - If true the list of files matching `input_glob` will be processed in sorted order. - recursive: bool, default = True - If true, events will be emitted for the files in subdirectories that match `input_glob`. - queue_max_size: int, default = 128 - Maximum queue size to hold the file paths to be processed that match `input_glob`. - batch_timeout: float, default = 5.0 - Timeout to retrieve batch messages from the queue. - """ - - def __init__(self, - c: Config, - input_glob: str, - watch_directory: bool = False, - max_files: int = -1, - file_type: FileTypes = FileTypes.Auto, - repeat: int = 1, - sort_glob: bool = False, - recursive: bool = True, - queue_max_size: int = 128, - batch_timeout: float = 5.0): - - SingleOutputSource.__init__(self, c) - - self._input_glob = input_glob - self._file_type = file_type - - self._feature_columns = c.ae.feature_columns - self._user_column_name = c.ae.userid_column_name - self._userid_filter = c.ae.userid_filter - - self._input_count = None - - # Hold the max index we have seen to ensure sequential and increasing indexes - self._rows_per_user: typing.Dict[str, int] = {} - - # Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. Iterative mode - # is good for interleaving source stages. 
- self._repeat_count = repeat - - self._watcher = DirectoryWatcher(input_glob=input_glob, - watch_directory=watch_directory, - max_files=max_files, - sort_glob=sort_glob, - recursive=recursive, - queue_max_size=queue_max_size, - batch_timeout=batch_timeout, - should_stop_fn=self.is_stop_requested) - - @property - def input_count(self) -> int: - """Return None for no max input count""" - return self._input_count if self._input_count is not None else 0 - - def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(UserMessageMeta) - - def get_match_pattern(self, glob_split): - """Return a file match pattern""" - dir_to_watch = os.path.dirname(glob_split[0]) - match_pattern = self._input_glob.replace(dir_to_watch + "/", "", 1) - - return match_pattern - - @staticmethod - def repeat_df(df: pd.DataFrame, repeat_count: int) -> typing.List[pd.DataFrame]: - """ - This function iterates over the same dataframe to extending small datasets in debugging with incremental - updates to the `event_dt` and `eventTime` columns. - - Parameters - ---------- - df : pd.DataFrame - To be repeated dataframe. - repeat_count : int - Number of times the given dataframe should be repeated. - - Returns - ------- - df_array : typing.List[pd.DataFrame] - List of repeated dataframes. - """ - - df_array = [] - - df_array.append(df) - - for _ in range(1, repeat_count): - x = df.copy() - - # Now increment the timestamps by the interval in the df - x["event_dt"] = x["event_dt"] + (x["event_dt"].iloc[-1] - x["event_dt"].iloc[0]) - x["eventTime"] = x["event_dt"].dt.strftime("%Y-%m-%dT%H:%M:%SZ") - - df_array.append(x) - - # Set df for next iteration - df = x - - return df_array - - @staticmethod - def batch_user_split(x: typing.List[pd.DataFrame], - userid_column_name: str, - userid_filter: str, - datetime_column_name="event_dt"): - """ - Creates a dataframe for each userid. - - Parameters - ---------- - x : typing.List[pd.DataFrame] - List of dataframes. 
- userid_column_name : str - Name of a dataframe column used for categorization. - userid_filter : str - Only rows with the supplied userid are filtered. - datetime_column_name : str - Name of the dataframe column used to sort the rows. - - Returns - ------- - user_dfs : typing.Dict[str, pd.DataFrame] - Dataframes, each of which is associated with a single userid. - """ - - combined_df = pd.concat(x) - - if (datetime_column_name in combined_df): - - # Convert to date_time column - # combined_df["event_dt"] = pd.to_datetime(combined_df["eventTime"]) - - # Set the index name so we can sort first by time then by index (to keep things all in order). Then restore - # the name - saved_index_name = combined_df.index.name - - combined_df.index.name = "idx" - - # Sort by time - combined_df = combined_df.sort_values(by=[datetime_column_name, "idx"]) - - combined_df.index.name = saved_index_name - - # Get the users in this DF - unique_users = combined_df[userid_column_name].unique() - - user_dfs = {} - - for user_name in unique_users: - - if (userid_filter is not None and user_name != userid_filter): - continue - - # Get just this users data and make a copy to remove link to grouped DF - user_df = combined_df[combined_df[userid_column_name] == user_name].copy() - - user_dfs[user_name] = user_df - - return user_dfs - - @staticmethod - @abstractmethod - def files_to_dfs_per_user(x: typing.List[str], - userid_column_name: str, - feature_columns: typing.List[str], - userid_filter: str = None, - repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: - """ - Stages that extend `AutoencoderSourceStage` must implement this abstract function - in order to convert messages in the files to dataframes per userid. - - Parameters - ---------- - x : typing.List[str] - List of messages. - userid_column_name : str - Name of the column used for categorization. - feature_columns : typing.List[str] - Feature column names. 
- userid_filter : str - Only rows with the supplied userid are filtered. - repeat_count : str - Number of times the given rows should be repeated. - - Returns - ------- - : typing.Dict[str, pd.DataFrame] - Dataframe per userid. - """ - - pass - - @staticmethod - def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): # pylint: disable=unused-argument - """ - If any features are available to be derived, can be implemented by overriding this function. - - Parameters - ---------- - df : pd.DataFrame - A dataframe. - feature_columns : typing.List[str] - Names of columns that are need to be derived. - - Returns - ------- - df : typing.List[pd.DataFrame] - Dataframe with actual and derived columns. - """ - return df - - def _add_derived_features(self, x: typing.Dict[str, pd.DataFrame]): - - for user_name in x.keys(): - x[user_name] = self.derive_features(x[user_name], None) - - return x - - def _build_user_metadata(self, x: typing.Dict[str, pd.DataFrame]): - - user_metas = [] - - for user_name, user_df in x.items(): - - # See if we have seen this user before - if (user_name not in self._rows_per_user): - self._rows_per_user[user_name] = 0 - - # Combine the original index with itself so it shows up as a named column - user_df.index.name = "_index_" + (user_df.index.name or "") - user_df = user_df.reset_index() - - # Now ensure the index for this user is correct - user_df.index = range(self._rows_per_user[user_name], self._rows_per_user[user_name] + len(user_df)) - self._rows_per_user[user_name] += len(user_df) - - # Now make a UserMessageMeta with the user name - meta = UserMessageMeta(user_df, user_name) - - user_metas.append(meta) - - return user_metas - - def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: - # The first source just produces filenames - return self._watcher.build_node(self.unique_name, builder) - - def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject: - - # At this point, 
we have batches of filenames to process. Make a node for processing batches of - # filenames into batches of dataframes - post_node = builder.make_node( - self.unique_name + "-post", - ops.map( - partial( - self.files_to_dfs_per_user, - userid_column_name=self._user_column_name, - feature_columns=None, # Use None here to leave all columns in - userid_filter=self._userid_filter, - repeat_count=self._repeat_count)), - ops.map(self._add_derived_features), - # Now group the batch of dataframes into a single df, split by user, and send a single UserMessageMeta - # per user - ops.map(self._build_user_metadata), - # Finally flatten to single meta - ops.flatten()) - builder.make_edge(out_node, post_node) - - return super()._post_build_single(builder, post_node) diff --git a/python/morpheus/morpheus/stages/input/azure_source_stage.py b/python/morpheus/morpheus/stages/input/azure_source_stage.py deleted file mode 100644 index 38661e3fc4..0000000000 --- a/python/morpheus/morpheus/stages/input/azure_source_stage.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import typing - -import pandas as pd - -from morpheus.cli import register_stage -from morpheus.config import PipelineModes -from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage - -logger = logging.getLogger(__name__) - - -@register_stage("from-azure", modes=[PipelineModes.AE]) -class AzureSourceStage(AutoencoderSourceStage): - """ - Source stage is used to load Azure Active Directory messages. - - Adds the following derived features: - - `appincrement`: Increments every time the logs contain a distinct app. - - `locincrement`: Increments every time a log contains a distinct city within a day. - - `logcount`: Tracks the number of logs generated by a user within a day. - - Parameters - ---------- - c : `morpheus.config.Config` - Pipeline configuration instance. - input_glob : str - Input glob pattern to match files to read. For example, `./input_dir/*.json` would read all files with the - 'json' extension in the directory input_dir. - watch_directory : bool, default = False - The watch directory option instructs this stage to not close down once all files have been read. Instead it will - read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional - files. Any new files that are added that match the glob will then be processed. - max_files: int, default = -1 - Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. - file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. - Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. - Supported extensions: 'json', 'csv' - repeat: int, default = 1 - How many times to repeat the dataset. Useful for extending small datasets in debugging. - sort_glob : bool, default = False - If true the list of files matching `input_glob` will be processed in sorted order. 
- recursive: bool, default = True - If true, events will be emitted for the files in subdirectories that match `input_glob`. - queue_max_size: int, default = 128 - Maximum queue size to hold the file paths to be processed that match `input_glob`. - batch_timeout: float, default = 5.0 - Timeout to retrieve batch messages from the queue. - """ - - @property - def name(self) -> str: - return "from-azure" - - def supports_cpp_node(self): - return False - - @staticmethod - def change_columns(df): - """ - Removes characters (_,.,{,},:) from the names of the dataframe columns. - - Parameters - ---------- - df : `pd.DataFrame` - Dataframe that requires column renaming. - - Returns - ------- - df : `pd.DataFrame` - Dataframe with renamed columns. - """ - - df.columns = df.columns.str.replace('[_,.,{,},:]', '') - df.columns = df.columns.str.strip() - return df - - @staticmethod - def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): - """ - Derives feature columns from the AzureAD (logs) source columns. - - Parameters - ---------- - df : pd.DataFrame - Dataframe for deriving columns. - feature_columns : typing.List[str] - Names of columns that are need to be derived. - - Returns - ------- - df : typing.List[pd.DataFrame] - Dataframe with actual and derived columns. 
- """ - - default_date = '1970-01-01T00:00:00.000000+00:00' - timestamp_column = "createdDateTime" - city_column = "locationcity" - state_column = "locationstate" - country_column = "locationcountryOrRegion" - application_column = "appDisplayName" - - df = AzureSourceStage.change_columns(df) - df['time'] = pd.to_datetime(df[timestamp_column], errors='coerce') - df['day'] = df['time'].dt.date - df.fillna({'time': pd.to_datetime(default_date), 'day': pd.to_datetime(default_date).date()}, inplace=True) - df.sort_values(by=['time'], inplace=True) - - overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] - overall_location_df = df[overall_location_columns].fillna('nan') - df['overall_location'] = overall_location_df.apply(lambda x: ', '.join(x), axis=1) - df['loc_cat'] = df.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - df.fillna({'loc_cat': 1}, inplace=True) - df['locincrement'] = df.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - df.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - - df['app_cat'] = df.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) - df.fillna({'app_cat': 1}, inplace=True) - df['appincrement'] = df.groupby('day')['app_cat'].expanding(1).max().droplevel(0) - df.drop('app_cat', inplace=True, axis=1) - - df["logcount"] = df.groupby('day').cumcount() - - if (feature_columns is not None): - df.drop(columns=df.columns.difference(feature_columns), inplace=True) - - return df - - @staticmethod - def files_to_dfs_per_user(x: typing.List[str], - userid_column_name: str, - feature_columns: typing.List[str], - userid_filter: str = None, - repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: - """ - After loading the input batch of AzureAD logs into a dataframe, this method builds a dataframe - for each set of userid rows in accordance with the specified filter condition. 
- - Parameters - ---------- - x : typing.List[str] - List of messages. - userid_column_name : str - Name of the column used for categorization. - feature_columns : typing.List[str] - Feature column names. - userid_filter : str - Only rows with the supplied userid are filtered. - repeat_count : str - Number of times the given rows should be repeated. - - Returns - ------- - df_per_user : typing.Dict[str, pd.DataFrame] - Dataframe per userid. - """ - - dfs = [] - for file in x: - df = pd.read_json(file, orient="records") - df = pd.json_normalize(df['properties']) - dfs = dfs + AutoencoderSourceStage.repeat_df(df, repeat_count) - - df_per_user = AutoencoderSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) - - return df_per_user diff --git a/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py b/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py deleted file mode 100644 index 968fee7ef2..0000000000 --- a/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import logging -import os -import typing - -import numpy as np -import pandas as pd - -from morpheus.cli import register_stage -from morpheus.common import FileTypes -from morpheus.common import determine_file_type -from morpheus.config import PipelineModes -from morpheus.io.deserializers import read_file_to_df -from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage - -logger = logging.getLogger(__name__) - - -@register_stage("from-cloudtrail", modes=[PipelineModes.AE]) -class CloudTrailSourceStage(AutoencoderSourceStage): - """ - Load messages from a CloudTrail directory. - - """ - - @property - def name(self) -> str: - return "from-cloudtrail" - - @property - def input_count(self) -> int: - """Return None for no max intput count""" - return self._input_count - - def supports_cpp_node(self): - return False - - def get_match_pattern(self, glob_split): - """Return a file match pattern""" - dir_to_watch = os.path.dirname(glob_split[0]) - match_pattern = self._input_glob.replace(dir_to_watch + "/", "", 1) - - return match_pattern - - @staticmethod - def read_file(filename: str, file_type: FileTypes) -> pd.DataFrame: - """ - Reads a file into a dataframe. - - Parameters - ---------- - filename : str - Path to a file to read. - file_type : `morpheus.common.FileTypes` - What type of file to read. Leave as Auto to auto detect based on the file extension. - - Returns - ------- - pandas.DataFrame - The parsed dataframe. - - Raises - ------ - RuntimeError - If an unsupported file type is detected. 
- """ - - df = read_file_to_df(filename, file_type, df_type="pandas") - - # If reading the file only produced one line and we are a JSON file, try loading structured file - if (determine_file_type(filename) == FileTypes.JSON and len(df) == 1 and list(df) == ["Records"]): - - # Reread with lines=False - df = read_file_to_df(filename, file_type, df_type="pandas", parser_kwargs={"lines": False}) - - # Normalize - df = pd.json_normalize(df['Records']) - - return df - - @staticmethod - def cleanup_df(df: pd.DataFrame, feature_columns: typing.List[str]): - """ - This function does clean up certain columns in the dataframe. - - Parameters - ---------- - df : pd.DataFrame - Dataframe for columns cleanup. - feature_columns : typing.List[str] - Only the columns that are present in the feature columns will be preserved in the dataframe - if feature columns are supplied.. - - Returns - ------- - df : typing.List[pd.DataFrame] - Clean dataframe. - """ - - # Replace all the dots in column names - df.columns = df.columns.str.replace('.', '', regex=False) - - df["event_dt"] = pd.to_datetime(df["eventTime"]) - - def remove_null(x): - - if isinstance(x, list): - if isinstance(x[0], dict): - key = list(x[0].keys()) - return x[0][key[0]] - return x - - def clean_column(cloudtrail_df): - - col_name = 'requestParametersownersSetitems' - if (col_name in cloudtrail_df): - cloudtrail_df[col_name] = cloudtrail_df[col_name].apply(lambda x: remove_null(x)) - return cloudtrail_df - - # Drop any unneeded columns if specified - if (feature_columns is not None): - df.drop(columns=df.columns.difference(feature_columns), inplace=True) - - # Reorder columns to be the same - # df = df[pd.Index(feature_columns).intersection(df.columns)] - - # Convert a numerical account ID into a string - if ("userIdentityaccountId" in df and df["userIdentityaccountId"].dtype != np.dtype('O')): - df['userIdentityaccountId'] = 'Account-' + df['userIdentityaccountId'].astype(str) - - df = clean_column(df) - - return df 
- - @staticmethod - def files_to_dfs_per_user(x: typing.List[str], - userid_column_name: str, - feature_columns: typing.List[str], - userid_filter: str = None, - repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: - """ - After loading the input batch of CloudTrail logs into a dataframe, this method builds a dataframe - for each set of userid rows in accordance with the specified filter condition. - - Parameters - ---------- - x : typing.List[str] - List of messages. - userid_column_name : str - Name of the column used for categorization. - feature_columns : typing.List[str] - Feature column names. - userid_filter : str - Only rows with the supplied userid are filtered. - repeat_count : str - Number of times the given rows should be repeated. - - Returns - ------- - df_per_user : typing.Dict[str, pd.DataFrame] - Dataframe per userid. - """ - - # Using pandas to parse nested JSON until cuDF adds support - # https://github.com/rapidsai/cudf/issues/8827 - dfs = [] - for file in x: - df = CloudTrailSourceStage.read_file(file, FileTypes.Auto) - df = CloudTrailSourceStage.cleanup_df(df, feature_columns) - dfs = dfs + CloudTrailSourceStage.repeat_df(df, repeat_count) - - df_per_user = CloudTrailSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) - - return df_per_user diff --git a/python/morpheus/morpheus/stages/input/duo_source_stage.py b/python/morpheus/morpheus/stages/input/duo_source_stage.py deleted file mode 100644 index 8f5e9c86c0..0000000000 --- a/python/morpheus/morpheus/stages/input/duo_source_stage.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Sourse stage for Duo Authentication logs.""" - -import json -import logging -import typing - -import pandas as pd - -from morpheus.cli import register_stage -from morpheus.config import PipelineModes -from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage - -DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' -logger = logging.getLogger(__name__) - - -@register_stage("from-duo", modes=[PipelineModes.AE]) -class DuoSourceStage(AutoencoderSourceStage): - """ - Source stage is used to load Duo Authentication messages. - - Adds the following derived features: - - `locincrement`: Increments every time a log contains a distinct city within a day. - - `logcount`: Tracks the number of logs generated by a user within a day. - - Parameters - ---------- - c : `morpheus.config.Config` - Pipeline configuration instance. - input_glob : str - Input glob pattern to match files to read. For example, `./input_dir/*.json` would read all files with the - 'json' extension in the directory input_dir. - watch_directory : bool, default = False - The watch directory option instructs this stage to not close down once all files have been read. Instead it will - read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional - files. Any new files that are added that match the glob will then be processed. - max_files: int, default = -1 - Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. - file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. 
- Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. - Supported extensions: 'json', 'csv' - repeat: int, default = 1 - How many times to repeat the dataset. Useful for extending small datasets in debugging. - sort_glob : bool, default = False - If true the list of files matching `input_glob` will be processed in sorted order. - recursive: bool, default = True - If true, events will be emitted for the files in subdirectories that match `input_glob`. - queue_max_size: int, default = 128 - Maximum queue size to hold the file paths to be processed that match `input_glob`. - batch_timeout: float, default = 5.0 - Timeout to retrieve batch messages from the queue. - """ - - @property - def name(self) -> str: - """Unique name for the stage.""" - return "from-duo" - - def supports_cpp_node(self): - """Indicate that this stages does not support a C++ node.""" - return False - - @staticmethod - def change_columns(df): - """ - Removes characters (_,.,{,},:) from the names of the dataframe columns. - - Parameters - ---------- - df : `pd.DataFrame` - Dataframe that requires column renaming. - - Returns - ------- - df : `pd.DataFrame` - Dataframe with renamed columns. - """ - df.columns = df.columns.str.replace('[_,.,{,},:]', '') - df.columns = df.columns.str.strip() - return df - - @staticmethod - def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): - """ - Derives feature columns from the DUO (logs) source columns. - - Parameters - ---------- - df : pd.DataFrame - Dataframe for deriving columns. - feature_columns : typing.List[str] - Names of columns that are need to be derived. - - Returns - ------- - df : typing.List[pd.DataFrame] - Dataframe with actual and derived columns. 
- """ - timestamp_column = "isotimestamp" - city_column = "accessdevicelocationcity" - state_column = "accessdevicelocationstate" - country_column = "accessdevicelocationcountry" - - df['time'] = pd.to_datetime(df[timestamp_column], errors='coerce') - df['day'] = df['time'].dt.date - df.fillna({'time': pd.to_datetime(DEFAULT_DATE), 'day': pd.to_datetime(DEFAULT_DATE).date()}, inplace=True) - df.sort_values(by=['time'], inplace=True) - - overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] - overall_location_df = df[overall_location_columns].fillna('nan') - df['overall_location'] = overall_location_df.apply(lambda x: ', '.join(x), axis=1) - df['loc_cat'] = df.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) - df.fillna({'loc_cat': 1}, inplace=True) - df['locincrement'] = df.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) - df.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) - - df["logcount"] = df.groupby('day').cumcount() - - if (feature_columns is not None): - df.drop(columns=df.columns.difference(feature_columns), inplace=True) - - return df - - @staticmethod - def files_to_dfs_per_user(x: typing.List[str], - userid_column_name: str, - feature_columns: typing.List[str], - userid_filter: str = None, - repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: - """ - After loading the input batch of DUO logs into a dataframe, this method builds a dataframe - for each set of userid rows in accordance with the specified filter condition. - - Parameters - ---------- - x : typing.List[str] - List of messages. - userid_column_name : str - Name of the column used for categorization. - feature_columns : typing.List[str] - Feature column names. - userid_filter : str - Only rows with the supplied userid are filtered. - repeat_count : str - Number of times the given rows should be repeated. 
- - Returns - ------- - df_per_user : typing.Dict[str, pd.DataFrame] - Dataframe per userid. - """ - dfs = [] - for file in x: - with open(file, encoding='UTF-8') as json_in: - log = json.load(json_in) - df = pd.json_normalize(log) - df = DuoSourceStage.change_columns(df) - dfs = dfs + AutoencoderSourceStage.repeat_df(df, repeat_count) - - df_per_user = AutoencoderSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) - - return df_per_user diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py deleted file mode 100644 index c7c639eddd..0000000000 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_ae_stage.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import typing -from functools import partial - -import cupy as cp -import mrc - -import morpheus._lib.messages as _messages -from morpheus.cli.register_stage import register_stage -from morpheus.config import Config -from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage -from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage - -logger = logging.getLogger(__name__) - - -@register_stage("preprocess", modes=[PipelineModes.AE]) -class PreprocessAEStage(PreprocessBaseStage): - """ - Prepare Autoencoder input DataFrames for inference. 
- - Parameters - ---------- - c : morpheus.config.Config - Pipeline configuration instance. - - """ - - def __init__(self, c: Config): - super().__init__(c) - - self._fea_length = c.feature_length - self._feature_columns = c.ae.feature_columns - - @property - def name(self) -> str: - return "preprocess-ae" - - def accepted_types(self) -> typing.Tuple: - """ - Returns accepted input types for this stage. - """ - return (ControlMessage, ) - - def supports_cpp_node(self): - return False - - @staticmethod - def pre_process_batch(msg: ControlMessage, fea_len: int, feature_columns: typing.List[str]) -> ControlMessage: - """ - This function performs pre-processing for autoencoder. - - Parameters - ---------- - msg : morpheus.messages.ControlMessage - Input rows received from Deserialized stage. - fea_len : int - Number of input features. - feature_columns : typing.List[str] - List of feature columns. - - Returns - ------- - morpheus.messages.ControlMessage - - """ - meta_df = msg.payload().get_data(msg.payload().df.columns.intersection(feature_columns)) - - autoencoder = msg.get_metadata("model") - scores_mean = msg.get_metadata("train_scores_mean") - scores_std = msg.get_metadata("train_scores_std") - count = len(meta_df.index) - - inputs = cp.zeros(meta_df.shape, dtype=cp.float32) - - if autoencoder is not None: - data = autoencoder.prepare_df(meta_df) - inputs = autoencoder.build_input_tensor(data) - inputs = cp.asarray(inputs.detach()) - count = inputs.shape[0] - - seg_ids = cp.zeros((count, 3), dtype=cp.uint32) - seg_ids[:, 0] = cp.arange(0, count, dtype=cp.uint32) - seg_ids[:, 2] = fea_len - 1 - - msg.set_metadata("model", autoencoder) - msg.set_metadata("train_scores_mean", scores_mean) - msg.set_metadata("train_scores_std", scores_std) - msg.tensors(_messages.TensorMemory(count=count, tensors={"input": inputs, "seq_ids": seg_ids})) - return msg - - def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - return 
partial(PreprocessAEStage.pre_process_batch, - fea_len=self._fea_length, - feature_columns=self._feature_columns) - - def _get_preprocess_node(self, builder: mrc.Builder): - raise NotImplementedError("No C++ node for AE") diff --git a/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py b/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py deleted file mode 100644 index ae246c4015..0000000000 --- a/python/morpheus/morpheus/stages/preprocess/train_ae_stage.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import glob -import importlib -import logging -import pathlib -import typing - -import dill -import mrc -import pandas as pd -from mrc.core import operators as ops - -from morpheus.cli.register_stage import register_stage -from morpheus.config import Config -from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import UserMessageMeta -from morpheus.models.dfencoder import AutoEncoder -from morpheus.pipeline.control_message_stage import ControlMessageStage -from morpheus.pipeline.stage_schema import StageSchema -from morpheus.utils.seed import manual_seed - -logger = logging.getLogger(__name__) - - -class _UserModelManager: - - def __init__(self, - config: Config, - user_id: str, - save_model: bool, - epochs: int, - max_history: int, - seed: int = None) -> None: - super().__init__() - - self._user_id = user_id - self._history: pd.DataFrame = None - self._max_history: int = max_history - self._seed: int = seed - self._feature_columns = config.ae.feature_columns - self._feature_scaler = config.ae.feature_scaler - self._epochs = epochs - self._save_model = save_model - - self._model: AutoEncoder = None - self._train_scores_mean = None - self._train_scores_std = None - - @property - def model(self): - return self._model - - @property - def train_scores_mean(self): - return self._train_scores_mean - - @property - def train_scores_std(self): - return self._train_scores_std - - def train(self, df: pd.DataFrame) -> AutoEncoder: - - # Determine how much history to save - if (self._history is not None): - to_drop = max(len(df) + len(self._history) - self._max_history, 0) - - history = self._history.iloc[to_drop:, :] - - train_df = pd.concat([history, df]) - else: - train_df = df - - # If the seed is set, enforce that here - if (self._seed is not None): - manual_seed(self._seed) - - model = AutoEncoder( - encoder_layers=[512, 500], # layers of the encoding part - decoder_layers=[512], # layers of the 
decoding part - activation='relu', # activation function - swap_probability=0.2, # noise parameter - learning_rate=0.01, # learning rate - learning_rate_decay=.99, # learning decay - batch_size=512, - # logger='ipynb', - verbose=False, - optimizer='sgd', # SGD optimizer is selected(Stochastic gradient descent) - scaler=self._feature_scaler, # feature scaling method - min_cats=1, # cut off for minority categories - progress_bar=False) - - logger.debug("Training AE model for user: '%s'...", self._user_id) - model.fit(train_df, epochs=self._epochs) - train_loss_scores = model.get_anomaly_score(train_df) - scores_mean = train_loss_scores.mean() - scores_std = train_loss_scores.std() - - logger.debug("Training AE model for user: '%s'... Complete.", self._user_id) - - if (self._save_model): - self._model = model - self._train_scores_mean = scores_mean - self._train_scores_std = scores_std - - # Save the history for next time - self._history = train_df.iloc[max(0, len(train_df) - self._max_history):, :] - - return model, scores_mean, scores_std - - -@register_stage("train-ae", modes=[PipelineModes.AE]) -class TrainAEStage(ControlMessageStage): - """ - Train an Autoencoder model on incoming data. - - This stage is used to train an Autoencoder model on incoming data a supply that model to downstream stages. The - Autoencoder workflows use this stage as a pre-processing step to build the model for inference. - - Parameters - ---------- - c : morpheus.config.Config - Pipeline configuration instance. - pretrained_filename : pathlib.Path, default = None - Loads a single pre-trained model for all users. - train_data_glob : str, default = None - On startup, all files matching this glob pattern will be loaded and used to train a model for each unique user - ID. - source_stage_class : str, default = None - If train_data_glob provided, use source stage to batch training data per user. - train_epochs : int, default = 25, min = 1 - The number of epochs to train user models for. 
Passed in as the `epoch` parameter to `AutoEncoder.fit` causes - data to be trained in `train_epochs` batches. - train_min_history : int, default = 300 - Minimum number of rows to train user model. - train_max_history : int, default = 1000, min = 1 - Maximum amount of rows that will be retained in history. As new data arrives, models will be retrained with a - maximum number of rows specified by this value. - seed : int, default = None - Seed to use when training. When not None, ensure random number generators are seeded with `seed` to control - reproducibility of user model training. - sort_glob : bool, default = False, is_flag = True - If true the list of files matching `input_glob` will be processed in sorted order. - models_output_filename : pathlib.Path, default = None, writable = True - The location to write trained models to. - """ - - def __init__(self, - c: Config, - pretrained_filename: pathlib.Path = None, - train_data_glob: str = None, - source_stage_class: str = None, - train_epochs: int = 25, - train_min_history: int = 300, - train_max_history: int = 1000, - seed: int = None, - sort_glob: bool = False, - models_output_filename: pathlib.Path = None): - super().__init__(c) - - self._config = c - self._feature_columns = c.ae.feature_columns - self._use_generic_model = c.ae.use_generic_model - self._batch_size = c.pipeline_batch_size - self._pretrained_filename = pretrained_filename - self._train_data_glob: str = train_data_glob - self._train_epochs = train_epochs - self._train_min_history = train_min_history - self._train_max_history = train_max_history - self._seed = seed - self._sort_glob = sort_glob - self._models_output_filename = models_output_filename - - self._source_stage_class = source_stage_class - if self._source_stage_class is not None: - source_stage_module, source_stage_classname = self._source_stage_class.rsplit('.', 1) - # load the source stage module, will raise ImportError if module cannot be loaded - source_stage_module = 
importlib.import_module(source_stage_module) - # get the source stage class, will raise AttributeError if class cannot be found - self._source_stage_class = getattr(source_stage_module, source_stage_classname) - - # Single model for the entire pipeline - self._pretrained_model: AutoEncoder = None - - # Per user model data - self._user_models: typing.Dict[str, _UserModelManager] = {} - - @property - def name(self) -> str: - return "train-ae" - - def accepted_types(self) -> typing.Tuple: - """ - Returns accepted input types for this stage. - - """ - return (UserMessageMeta, ) - - def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(ControlMessage) - - def supports_cpp_node(self): - return False - - def _get_per_user_model(self, x: UserMessageMeta): - - model = None - train_scores_mean = None - train_scores_std = None - user_model = None - - if x.user_id in self._user_models: - user_model = self._user_models[x.user_id] - elif self._use_generic_model and "generic" in self._user_models.keys(): - user_model = self._user_models["generic"] - - if (user_model is not None): - model = user_model.model - train_scores_mean = user_model.train_scores_mean - train_scores_std = user_model.train_scores_std - - return model, train_scores_mean, train_scores_std - - def _train_model(self, x: UserMessageMeta) -> list[ControlMessage]: - - if (x.user_id not in self._user_models): - self._user_models[x.user_id] = _UserModelManager(self._config, - x.user_id, - False, - self._train_epochs, - self._train_max_history, - self._seed) - - return self._user_models[x.user_id].train(x.df) - - def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - get_model_fn = None - - # If a pretrained model was specified, load that now - if (self._pretrained_filename is not None): - if (self._train_data_glob is not None): - logger.warning("Both 'pretrained_filename' and 'train_data_glob' were specified. 
" - "The 'train_data_glob' will be ignored") - - with open(self._pretrained_filename, 'rb') as in_strm: - # self._pretrained_model = dill.load(in_strm) - self._user_models = dill.load(in_strm) - - # get_model_fn = self._get_pretrained_model - get_model_fn = self._get_per_user_model - - elif (self._train_data_glob is not None): - if (self._source_stage_class is None): - raise RuntimeError("source_stage_class must be provided with train_data_glob") - file_list = glob.glob(self._train_data_glob) - if self._sort_glob: - file_list = sorted(file_list) - - user_to_df = self._source_stage_class.files_to_dfs_per_user(file_list, - self._config.ae.userid_column_name, - self._feature_columns, - self._config.ae.userid_filter) - - if self._use_generic_model: - self._user_models["generic"] = _UserModelManager(self._config, - "generic", - True, - self._train_epochs, - self._train_max_history, - self._seed) - - all_users_df = pd.concat(user_to_df.values(), ignore_index=True) - all_users_df = self._source_stage_class.derive_features(all_users_df, self._feature_columns) - all_users_df = all_users_df.fillna("nan") - self._user_models["generic"].train(all_users_df) - - for user_id, df in user_to_df.items(): - if len(df.index) >= self._train_min_history: - self._user_models[user_id] = _UserModelManager(self._config, - user_id, - True, - self._train_epochs, - self._train_max_history, - self._seed) - - # Derive features here - # print(df) - df = self._source_stage_class.derive_features(df, self._feature_columns) - df = df.fillna("nan") - self._user_models[user_id].train(df) - - # Save trained user models - if self._models_output_filename is not None: - with open(self._models_output_filename, 'wb') as out_strm: - dill.dump(self._user_models, out_strm) - - get_model_fn = self._get_per_user_model - - else: - get_model_fn = self._train_model - - def on_next(x: UserMessageMeta): - - model, scores_mean, scores_std = get_model_fn(x) - - # cuDF does not yet support timezone-aware datetimes - # 
Remove timezone information from pd.DatetimeTZDtype columns - with x.mutable_dataframe() as df: - for col in [col for col in df.columns if isinstance(df[col].dtype, pd.DatetimeTZDtype)]: - df[col] = df[col].dt.tz_convert(None) - - full_message = ControlMessage() - full_message.payload(x) - full_message.set_metadata("model", model) - full_message.set_metadata("train_scores_mean", scores_mean) - full_message.set_metadata("train_scores_std", scores_std) - - to_send = [] - - # Now split into batches - for i in range(0, full_message.payload().count, self._batch_size): - output_message = ControlMessage(full_message) - output_message.payload(full_message.payload().get_slice( - i, min(i + self._batch_size, full_message.payload().count))) - to_send.append(output_message) - - return to_send - - node = builder.make_node(self.unique_name, ops.map(on_next), ops.flatten()) - builder.make_edge(input_node, node) - - return node diff --git a/scripts/validation/hammah/val-hammah-all.sh b/scripts/validation/hammah/val-hammah-all.sh deleted file mode 100755 index ea5021bc37..0000000000 --- a/scripts/validation/hammah/val-hammah-all.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set -e +o pipefail -# set -x -# set -v - - -# RUN OPTIONS -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -# Call minibert first -${SCRIPT_DIR}/val-hammah.sh "user123" -${SCRIPT_DIR}/val-hammah.sh "role-g" diff --git a/scripts/validation/hammah/val-hammah.sh b/scripts/validation/hammah/val-hammah.sh deleted file mode 100755 index 1201502e91..0000000000 --- a/scripts/validation/hammah/val-hammah.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e +o pipefail -# set -x -# set -v - -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" - -# Override the global defaults -RUN_PYTORCH=1 -RUN_TRITON_ONNX=0 - -# Load the utility scripts -source ${SCRIPT_DIR}/../val-run-pipeline.sh - -# Get the model/data from the argument. 
Must be 'role-g' or 'user123' -HAMMAH_TYPE=${HAMMAH_TYPE:-$1} - -HAMMAH_INPUT_FILE=${SID_INPUT_FILE:-"${MORPHEUS_ROOT}/models/datasets/validation-data/hammah-${HAMMAH_TYPE}-validation-data.csv"} -HAMMAH_TRUTH_FILE=${SID_TRUTH_FILE:-"${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-${HAMMAH_TYPE}-validation-data-output.csv"} - -MODEL_FILE=${MODEL_FILE:-"${MORPHEUS_ROOT}/models/hammah-models/hammah-${HAMMAH_TYPE}-20211017.pkl"} -MODEL_DIRECTORY=${MODEL_FILE%/*} -MODEL_FILENAME=$(basename -- "${MODEL_FILE}") -MODEL_EXTENSION="${MODEL_FILENAME##*.}" -MODEL_NAME="${MODEL_FILENAME%.*}" - -OUTPUT_FILE_BASE="${MORPHEUS_ROOT}/.tmp/val_${MODEL_NAME}-" - -if [[ "${RUN_PYTORCH}" = "1" ]]; then - OUTPUT_FILE="${OUTPUT_FILE_BASE}pytorch.csv" - VAL_OUTPUT_FILE="${OUTPUT_FILE_BASE}pytorch-results.json" - - run_pipeline_hammah_${HAMMAH_TYPE} \ - "${HAMMAH_INPUT_FILE}" \ - "inf-pytorch" \ - "${OUTPUT_FILE}" \ - "${HAMMAH_TRUTH_FILE}" \ - "${VAL_OUTPUT_FILE}" - - # Get the diff - PYTORCH_ERROR="${b}$(calc_error_val ${VAL_OUTPUT_FILE})" -else - PYTORCH_ERROR="${y}Skipped" -fi - -if [[ "${RUN_TRITON_ONNX}" = "1" ]]; then - - load_triton_model "phishing-bert-onnx" - - OUTPUT_FILE="${OUTPUT_FILE_BASE}triton-onnx.csv" - VAL_OUTPUT_FILE="${OUTPUT_FILE_BASE}triton-onnx-results.json" - - run_pipeline_hammah_${HAMMAH_TYPE} \ - "${HAMMAH_INPUT_FILE}" \ - "inf-triton --model_name=phishing-bert-onnx --server_url=${TRITON_URL} --force_convert_inputs=True" \ - "${OUTPUT_FILE}" \ - "${HAMMAH_TRUTH_FILE}" \ - "${VAL_OUTPUT_FILE}" - - # Get the diff - TRITON_ONNX_ERROR="${b}$(calc_error_val ${VAL_OUTPUT_FILE})" -else - TRITON_ONNX_ERROR="${y}Skipped" -fi - -if [[ "${RUN_TRITON_TRT}" = "1" ]]; then - load_triton_model "phishing-bert-trt" - - OUTPUT_FILE="${OUTPUT_FILE_BASE}triton-trt.csv" - VAL_OUTPUT_FILE="${OUTPUT_FILE_BASE}triton-trt-results.json" - - run_pipeline_hammah_${HAMMAH_TYPE} \ - "${HAMMAH_INPUT_FILE}" \ - "inf-triton --model_name=phishing-bert-trt 
--server_url=${TRITON_URL} --force_convert_inputs=True" \ - "${OUTPUT_FILE}" \ - "${HAMMAH_TRUTH_FILE}" \ - "${VAL_OUTPUT_FILE}" - - # Get the diff - TRITON_TRT_ERROR="${b}$(calc_error_val ${VAL_OUTPUT_FILE})" -else - TRITON_TRT_ERROR="${y}Skipped" -fi - -if [[ "${RUN_TENSORRT}" = "1" ]]; then - # Generate the TensorRT model - cd ${MORPHEUS_ROOT}/models/triton-model-repo/sid-${SID_TYPE}-trt/1 - - echo "Generating the TensorRT model. This may take a minute..." - morpheus tools onnx-to-trt --input_model ${MODEL_DIRECTORY}/${MODEL_NAME}.onnx --output_model ./sid-${SID_TYPE}-trt_b1-8_b1-16_b1-32.engine --batches 1 8 --batches 1 16 --batches 1 32 --seq_length 256 --max_workspace_size 16000 - - cd ${MORPHEUS_ROOT} - - OUTPUT_FILE="${OUTPUT_FILE_BASE}tensorrt.csv" - VAL_OUTPUT_FILE="${OUTPUT_FILE_BASE}tensorrt-results.json" - - run_pipeline_hammah_${HAMMAH_TYPE} \ - "${HAMMAH_INPUT_FILE}" \ - "inf-triton --model_name=sid-${SID_TYPE}-trt --server_url=${TRITON_URL} --force_convert_inputs=True" \ - "${OUTPUT_FILE}" \ - "${HAMMAH_TRUTH_FILE}" \ - "${VAL_OUTPUT_FILE}" - - # Get the diff - TRITON_TRT_ERROR="${b}$(calc_error_val ${VAL_OUTPUT_FILE})" - -else - TENSORRT_ERROR="${y}Skipped" -fi - -echo -e "${b}===ERRORS===${x}" -echo -e "PyTorch :${PYTORCH_ERROR}${x}" -echo -e "Triton(ONNX):${TRITON_ONNX_ERROR}${x}" -echo -e "Triton(TRT) :${TRITON_TRT_ERROR}${x}" -echo -e "TensorRT :${TENSORRT_ERROR}${x}" - -echo -e "${g}Complete!${x}" diff --git a/scripts/validation/val-run-pipeline.sh b/scripts/validation/val-run-pipeline.sh index 0af859a6e0..83f76d80d8 100755 --- a/scripts/validation/val-run-pipeline.sh +++ b/scripts/validation/val-run-pipeline.sh @@ -112,47 +112,3 @@ function run_pipeline_phishing_email(){ serialize \ to-file --filename=${OUTPUT_FILE} --overwrite } - -function run_pipeline_hammah_user123(){ - - INPUT_FILE=$1 - INFERENCE_STAGE=$2 - OUTPUT_FILE=$3 - VAL_FILE=$4 - VAL_OUTPUT=$5 - - morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 
--model_max_batch_size=1024 --use_cpp=${USE_CPP} \ - pipeline-ae --columns_file="${MORPHEUS_ROOT}/python/morpheus/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="user123" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ - from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ - train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ - preprocess \ - ${INFERENCE_STAGE} \ - add-scores \ - timeseries --resolution=1m --zscore_threshold=8.0 --hot_start \ - monitor --description "Inference Rate" --smoothing=0.001 --unit inf \ - validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --index_col="_index_" --exclude "event_dt" --rel_tol=0.1 --overwrite \ - serialize \ - to-file --filename=${OUTPUT_FILE} --overwrite -} - -function run_pipeline_hammah_role-g(){ - - INPUT_FILE=$1 - INFERENCE_STAGE=$2 - OUTPUT_FILE=$3 - VAL_FILE=$4 - VAL_OUTPUT=$5 - - morpheus --log_level=DEBUG run --num_threads=$(nproc) --pipeline_batch_size=1024 --model_max_batch_size=1024 --use_cpp=${USE_CPP} \ - pipeline-ae --columns_file="${MORPHEUS_ROOT}/python/morpheus/morpheus/data/columns_ae_cloudtrail.txt" --userid_filter="role-g" --userid_column_name="userIdentitysessionContextsessionIssueruserName" --timestamp_column_name="event_dt" \ - from-cloudtrail --input_glob="${MORPHEUS_ROOT}/models/datasets/validation-data/dfp-cloudtrail-*-input.csv" \ - train-ae --train_data_glob="${MORPHEUS_ROOT}/models/datasets/training-data/dfp-cloudtrail-*.csv" --source_stage_class=morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage --seed 42 \ - preprocess \ - ${INFERENCE_STAGE} \ - add-scores \ - timeseries --resolution=1m --zscore_threshold=8.0 --hot_start \ - monitor --description "Inference Rate" --smoothing=0.001 --unit 
inf \ - validate --val_file_name=${VAL_FILE} --results_file_name=${VAL_OUTPUT} --index_col="_index_" --exclude "event_dt" --rel_tol=0.15 --overwrite \ - serialize \ - to-file --filename=${OUTPUT_FILE} --overwrite -} diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md index 9aa0bd105a..f7ef4f5538 100644 --- a/tests/benchmarks/README.md +++ b/tests/benchmarks/README.md @@ -193,7 +193,7 @@ Additional benchmark stats for each workflow: ### Production DFP E2E Benchmarks -Note that the `test_cloudtrail_ae_e2e` benchmarks measure performance of a pipeline built using [Starter DFP](../../examples/digital_fingerprinting/starter/README.md) stages. Separate benchmark tests are also provided to measure performance of the example [Production DFP](../../examples/digital_fingerprinting/production/README.md) pipelines. More information about running those benchmarks can be found [here](../../examples/digital_fingerprinting/production/morpheus/benchmarks/README.md). +Separate benchmark tests are provided to measure performance of the example [Production DFP](../../examples/digital_fingerprinting/production/README.md) pipelines. More information about running those benchmarks can be found [here](../../examples/digital_fingerprinting/production/morpheus/benchmarks/README.md). You can use the same Dev container created here to run the Production DFP benchmarks. 
You would just need to install additional dependencies as follows: diff --git a/tests/benchmarks/e2e_test_configs.json b/tests/benchmarks/e2e_test_configs.json index eae85c1deb..83449a5517 100644 --- a/tests/benchmarks/e2e_test_configs.json +++ b/tests/benchmarks/e2e_test_configs.json @@ -26,15 +26,5 @@ "model_max_batch_size": 64, "feature_length": 128, "edge_buffer_size": 4 - }, - "test_cloudtrail_ae_e2e": { - "input_glob_path": "../../models/datasets/validation-data/dfp-cloudtrail-*-input.csv", - "train_glob_path": "../../models/datasets/training-data/dfp-cloudtrail-*-training-data.csv", - "repeat": 1, - "num_threads": 1, - "pipeline_batch_size": 1024, - "model_max_batch_size": 1024, - "feature_length": 32, - "edge_buffer_size": 4 } -} \ No newline at end of file +} diff --git a/tests/benchmarks/test_bench_e2e_pipelines.py b/tests/benchmarks/test_bench_e2e_pipelines.py index e99e7bbc07..b9f6880d3e 100644 --- a/tests/benchmarks/test_bench_e2e_pipelines.py +++ b/tests/benchmarks/test_bench_e2e_pipelines.py @@ -21,25 +21,19 @@ from _utils import TEST_DIRS from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder from morpheus.config import ConfigFIL from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage -from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage -from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage -from morpheus.stages.postprocess.add_scores_stage import AddScoresStage from 
morpheus.stages.postprocess.serialize_stage import SerializeStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage -from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage from morpheus.stages.preprocess.preprocess_fil_stage import PreprocessFILStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage -from morpheus.stages.preprocess.train_ae_stage import TrainAEStage from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging @@ -97,28 +91,6 @@ def fil_pipeline(config: Config, input_file, repeat, output_file, model_name): pipeline.run() -def ae_pipeline(config: Config, input_glob, repeat, train_data_glob, output_file): - - configure_logging(log_level=logging.INFO) - pipeline = LinearPipeline(config) - pipeline.set_source(CloudTrailSourceStage(config, input_glob=input_glob, max_files=200, repeat=repeat)) - pipeline.add_stage( - TrainAEStage(config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipeline.add_stage(PreprocessAEStage(config)) - pipeline.add_stage(AutoEncoderInferenceStage(config)) - pipeline.add_stage(AddScoresStage(config)) - pipeline.add_stage(MonitorStage(config, log_level=logging.INFO)) - pipeline.add_stage(SerializeStage(config)) - pipeline.add_stage(WriteToFileStage(config, filename=output_file, overwrite=True)) - - pipeline.build() - pipeline.run() - - @pytest.mark.benchmark def test_sid_nlp_e2e(benchmark, tmp_path): @@ -196,30 +168,3 @@ def test_phishing_nlp_e2e(benchmark, tmp_path): model_name = "phishing-bert-onnx" benchmark(nlp_pipeline, config, input_filepath, repeat, vocab_filepath, output_filepath, model_name) - - -@pytest.mark.benchmark -def test_cloudtrail_ae_e2e(benchmark, tmp_path): - - config = Config() - config.mode = PipelineModes.AE - config.num_threads = 
E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["num_threads"] - config.pipeline_batch_size = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["pipeline_batch_size"] - config.model_max_batch_size = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["model_max_batch_size"] - config.feature_length = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["feature_length"] - config.edge_buffer_size = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["edge_buffer_size"] - config.class_labels = ["reconstruct_loss", "zscore"] - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentityaccountId" - config.ae.userid_filter = "Account-123456789" - ae_cols_filepath = os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt') - config.ae.feature_columns = load_labels_file(ae_cols_filepath) - CppConfig.set_should_use_cpp(False) - - input_glob = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["input_glob_path"] - repeat = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["repeat"] - train_glob = E2E_TEST_CONFIGS["test_cloudtrail_ae_e2e"]["train_glob_path"] - output_filepath = os.path.join(tmp_path, "cloudtrail_ae_e2e_output.csv") - - benchmark(ae_pipeline, config, input_glob, repeat, train_glob, output_filepath) diff --git a/tests/stages/test_preprocess_ae_stage.py b/tests/stages/test_preprocess_ae_stage.py deleted file mode 100644 index 5202361b41..0000000000 --- a/tests/stages/test_preprocess_ae_stage.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import typing - -import cupy as cp -import pytest -import typing_utils - -import cudf - -from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder -from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta -from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage - - -@pytest.fixture(name='config') -def fixture_config(config: Config): - config.feature_length = 256 - config.ae = ConfigAutoEncoder() - config.ae.feature_columns = ["data"] - yield config - - -def test_constructor(config: Config): - stage = PreprocessAEStage(config) - assert stage.name == "preprocess-ae" - - accepted_union = typing.Union[stage.accepted_types()] - assert typing_utils.issubtype(ControlMessage, accepted_union) - - -def test_process_control_message(config: Config): - stage = PreprocessAEStage(config) - - df = cudf.DataFrame({"data": ["a", "b", "c"]}) - meta = MessageMeta(df) - - input_control_message = ControlMessage() - input_control_message.payload(meta) - - output_control_message = stage.pre_process_batch(input_control_message, fea_len=256, feature_columns=["data"]) - - expected_input = cp.zeros(df.shape, dtype=cp.float32) - assert cp.array_equal(output_control_message.tensors().get_tensor("input"), expected_input) - - expect_seq_ids = cp.zeros((df.shape[0], 3), dtype=cp.uint32) - expect_seq_ids[:, 0] = cp.arange(0, df.shape[0], dtype=cp.uint32) - expect_seq_ids[:, 2] = stage._fea_length - 1 - assert cp.array_equal(output_control_message.tensors().get_tensor("seq_ids"), expect_seq_ids) diff --git 
a/tests/test_cli.py b/tests/test_cli.py index 1f578e5990..d97a94c8f2 100755 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -32,15 +32,11 @@ from morpheus.common import FileTypes from morpheus.common import FilterSource from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.stages.general.monitor_stage import MonitorStage -from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage from morpheus.stages.inference.identity_inference_stage import IdentityInferenceStage from morpheus.stages.inference.pytorch_inference_stage import PyTorchInferenceStage from morpheus.stages.inference.triton_inference_stage import TritonInferenceStage -from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.stages.input.kafka_source_stage import KafkaSourceStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -50,14 +46,11 @@ from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage from morpheus.stages.postprocess.ml_flow_drift_stage import MLFlowDriftStage from morpheus.stages.postprocess.serialize_stage import SerializeStage -from morpheus.stages.postprocess.timeseries_stage import TimeSeriesStage from morpheus.stages.postprocess.validation_stage import ValidationStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.stages.preprocess.drop_null_stage import DropNullStage -from morpheus.stages.preprocess.preprocess_ae_stage import PreprocessAEStage from morpheus.stages.preprocess.preprocess_fil_stage import PreprocessFILStage from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage -from morpheus.stages.preprocess.train_ae_stage import TrainAEStage from morpheus.utils.file_utils import load_labels_file 
GENERAL_ARGS = ['run', '--num_threads=12', '--pipeline_batch_size=1024', '--model_max_batch_size=1024', '--use_cpp=0'] @@ -144,9 +137,8 @@ def config_warning_fixture(): @pytest.mark.use_python class TestCLI: - @pytest.mark.parametrize('cmd', - [[], ['tools'], ['run'], ['run', 'pipeline-ae'], ['run', 'pipeline-fil'], - ['run', 'pipeline-nlp'], ['run', 'pipeline-other']]) + @pytest.mark.parametrize( + 'cmd', [[], ['tools'], ['run'], ['run', 'pipeline-fil'], ['run', 'pipeline-nlp'], ['run', 'pipeline-other']]) def test_help(self, cmd: list[str]): runner = CliRunner() result = runner.invoke(commands.cli, cmd + ['--help']) @@ -177,187 +169,6 @@ def test_manual_seed(self, mock_manual_seed: mock.MagicMock, value: int, use_env assert result.exit_code == 0, result.output mock_manual_seed.assert_called_once_with(value) - @pytest.mark.replace_callback('pipeline_ae') - def test_pipeline_ae(self, config, callback_values): - """ - Build a pipeline roughly ressembles the DFP validation script - """ - args = (GENERAL_ARGS + [ - 'pipeline-ae', - '--columns_file=data/columns_ae_cloudtrail.txt', - '--userid_filter=user321', - '--userid_column_name=user_col', - 'from-cloudtrail', - '--input_glob=input_glob*.csv', - 'train-ae', - '--train_data_glob=train_glob*.csv', - '--seed', - '47', - 'preprocess', - 'inf-pytorch', - 'add-scores', - 'timeseries', - '--resolution=1m', - '--zscore_threshold=8.0', - '--hot_start' - ] + MONITOR_ARGS + VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS) - - obj = {} - runner = CliRunner() - result = runner.invoke(commands.cli, args, obj=obj) - assert result.exit_code == 47, result.output - - # Ensure our config is populated correctly - - config = obj["config"] - assert config.mode == PipelineModes.AE - assert not CppConfig.get_should_use_cpp() - assert config.class_labels == ["reconstruct_loss", "zscore"] - assert config.model_max_batch_size == 1024 - assert config.pipeline_batch_size == 1024 - assert config.num_threads == 12 - - assert isinstance(config.ae, 
ConfigAutoEncoder) - config.ae.userid_column_name = "user_col" - config.ae.userid_filter = "user321" - - expected_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt')) - assert config.ae.feature_columns == expected_columns - - pipe = callback_values['pipe'] - assert pipe is not None - - stages = callback_values['stages'] - # Verify the stages are as we expect them, if there is a size-mismatch python will raise a Value error - [cloud_trail, train_ae, process_ae, auto_enc, add_scores, time_series, monitor, validation, serialize, - to_file] = stages - - assert isinstance(cloud_trail, CloudTrailSourceStage) - assert cloud_trail._watcher._input_glob == "input_glob*.csv" - - assert isinstance(train_ae, TrainAEStage) - assert train_ae._train_data_glob == "train_glob*.csv" - assert train_ae._seed == 47 - - assert isinstance(process_ae, PreprocessAEStage) - assert isinstance(auto_enc, AutoEncoderInferenceStage) - assert isinstance(add_scores, AddScoresStage) - - assert isinstance(time_series, TimeSeriesStage) - assert time_series._resolution == '1m' - assert time_series._zscore_threshold == 8.0 - assert time_series._hot_start - - assert isinstance(monitor, MonitorStage) - assert monitor._mc._description == 'Unittest' - assert monitor._mc._smoothing == 0.001 - assert monitor._mc._unit == 'inf' - - assert isinstance(validation, ValidationStage) - assert validation._results_file_name == 'results.json' - assert validation._index_col == '_index_' - - # Click appears to be converting this into a tuple - assert list(validation._exclude_columns) == ['event_dt'] - assert validation._rel_tol == 0.1 - - assert isinstance(serialize, SerializeStage) - - assert isinstance(to_file, WriteToFileStage) - assert to_file._controller._output_file == 'out.csv' - - @pytest.mark.replace_callback('pipeline_ae') - def test_pipeline_ae_all(self, callback_values): - """ - Attempt to add all possible stages to the pipeline_ae, even if the pipeline doesn't - actually 
make sense, just test that cli could assemble it - """ - args = (GENERAL_ARGS + [ - 'pipeline-ae', - '--columns_file=data/columns_ae_cloudtrail.txt', - '--userid_filter=user321', - '--userid_column_name=user_col', - 'from-cloudtrail', - '--input_glob=input_glob*.csv', - 'add-class', - 'unittest-conv-msg', - 'filter', - 'train-ae', - '--train_data_glob=train_glob*.csv', - '--seed', - '47', - 'preprocess', - 'inf-pytorch', - 'add-scores' - ] + ['timeseries', '--resolution=1m', '--zscore_threshold=8.0', '--hot_start'] + MONITOR_ARGS + VALIDATE_ARGS + - ['serialize'] + TO_FILE_ARGS + TO_KAFKA_ARGS) - - runner = CliRunner() - result = runner.invoke(commands.cli, args) - - assert result.exit_code == 47, result.output - - stages = callback_values['stages'] - # Verify the stages are as we expect them, if there is a size-mismatch python will raise a Value error - [ - cloud_trail, - add_class, - conv_msg, - filter_stage, - train_ae, - process_ae, - auto_enc, - add_scores, - time_series, - monitor, - validation, - serialize, - to_file, - to_kafka - ] = stages - - assert isinstance(cloud_trail, CloudTrailSourceStage) - assert cloud_trail._watcher._input_glob == "input_glob*.csv" - - assert isinstance(add_class, AddClassificationsStage) - assert isinstance(conv_msg, ConvMsg) - assert isinstance(filter_stage, FilterDetectionsStage) - - assert isinstance(train_ae, TrainAEStage) - assert train_ae._train_data_glob == "train_glob*.csv" - assert train_ae._seed == 47 - - assert isinstance(process_ae, PreprocessAEStage) - assert isinstance(auto_enc, AutoEncoderInferenceStage) - assert isinstance(add_scores, AddScoresStage) - - assert isinstance(time_series, TimeSeriesStage) - assert time_series._resolution == '1m' - assert time_series._zscore_threshold == 8.0 - assert time_series._hot_start - - assert isinstance(monitor, MonitorStage) - assert monitor._mc._description == 'Unittest' - assert monitor._mc._smoothing == 0.001 - assert monitor._mc._unit == 'inf' - - assert 
isinstance(validation, ValidationStage) - assert validation._results_file_name == 'results.json' - assert validation._index_col == '_index_' - - # Click appears to be converting this into a tuple - assert list(validation._exclude_columns) == ['event_dt'] - assert validation._rel_tol == 0.1 - - assert isinstance(serialize, SerializeStage) - - assert isinstance(to_file, WriteToFileStage) - assert to_file._controller._output_file == 'out.csv' - - assert isinstance(to_kafka, WriteToKafkaStage) - assert to_kafka._kafka_conf['bootstrap.servers'] == 'kserv1:123,kserv2:321' - assert to_kafka._output_topic == 'test_topic' - @pytest.mark.replace_callback('pipeline_fil') def test_pipeline_fil(self, config, callback_values): """ @@ -1028,64 +839,3 @@ def test_pipeline_fil_relative_path_precedence(self, config: Config, tmp_path: s assert config.class_labels == test_labels assert config.fil.feature_columns == test_columns - - # pylint: disable=unused-argument - @pytest.mark.replace_callback('pipeline_ae') - def test_pipeline_ae_relative_path_precedence(self, config: Config, tmp_path: str, callback_values: dict): - """ - Ensure that relative paths are choosen over the morpheus data directory paths - """ - - labels_file = "data/labels_ae.txt" - columns_file = "data/columns_ae_cloudtrail.txt" - - labels_file_local = os.path.join(tmp_path, labels_file) - columns_file_local = os.path.join(tmp_path, columns_file) - - os.makedirs(os.path.join(tmp_path, "data"), exist_ok=True) - - # Use different labels - test_labels = ["label1"] - - # Overwrite the copied labels - with open(labels_file_local, mode="w", encoding='UTF-8') as f: - f.writelines("\n".join(test_labels)) - - # Use different labels - test_columns = [f"column{i}" for i in range(33)] - - # Overwrite the copied labels - with open(columns_file_local, mode="w", encoding='UTF-8') as f: - f.writelines("\n".join(test_columns)) - - args = (GENERAL_ARGS + [ - 'pipeline-ae', - '--userid_filter=user321', - '--userid_column_name=user_col', 
- f"--labels_file={labels_file}", - f"--columns_file={columns_file}", - 'from-cloudtrail', - '--input_glob=input_glob*.csv', - 'train-ae', - '--train_data_glob=train_glob*.csv', - '--seed', - '47', - 'preprocess', - 'inf-pytorch', - 'add-scores', - 'timeseries', - '--resolution=1m', - '--zscore_threshold=8.0', - '--hot_start' - ] + MONITOR_ARGS + VALIDATE_ARGS + ['serialize'] + TO_FILE_ARGS) - - obj = {} - runner = CliRunner() - result = runner.invoke(commands.cli, args, obj=obj) - assert result.exit_code == 47, result.output - - # Ensure our config is populated correctly - config = obj["config"] - assert config.class_labels == test_labels - - assert config.ae.feature_columns == test_columns diff --git a/tests/test_dfp.py b/tests/test_dfp.py deleted file mode 100755 index 521509369d..0000000000 --- a/tests/test_dfp.py +++ /dev/null @@ -1,304 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -from unittest import mock - -import numpy as np -import pandas as pd -import pytest - -from _utils import TEST_DIRS -from _utils import calc_error_val -from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder -from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import MessageMeta -from morpheus.messages.message_meta import UserMessageMeta -from morpheus.pipeline import LinearPipeline -from morpheus.stages.general.monitor_stage import MonitorStage -from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage -from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage -from morpheus.stages.output.write_to_file_stage import WriteToFileStage -from morpheus.stages.postprocess.add_scores_stage import AddScoresStage -from morpheus.stages.postprocess.serialize_stage import SerializeStage -from morpheus.stages.postprocess.timeseries_stage import TimeSeriesStage -from morpheus.stages.postprocess.validation_stage import ValidationStage -from morpheus.stages.preprocess import preprocess_ae_stage -from morpheus.stages.preprocess import train_ae_stage - -# End-to-end test intended to imitate the DFP validation test - - -@pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) -@pytest.mark.usefixtures("reload_modules") -@mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_roleg(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_tensor.csv'), delimiter=',') - anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_anomaly_score.csv'), delimiter=',') - exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_exp_results.csv')) - - mock_input_tensor = mock.MagicMock() - 
mock_input_tensor.return_value = mock_input_tensor - mock_input_tensor.detach.return_value = tensor_data - - mock_ae.return_value = mock_ae - mock_ae.build_input_tensor.return_value = mock_input_tensor - mock_ae.get_anomaly_score.return_value = anomaly_score - mock_ae.get_results.return_value = exp_results - - config.mode = PipelineModes.AE - config.class_labels = ["reconstruct_loss", "zscore"] - config.model_max_batch_size = 1024 - config.pipeline_batch_size = 1024 - config.feature_length = 256 - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.userid_filter = "role-g" - config.ae.timestamp_column_name = "event_dt" - - with open(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt'), encoding='UTF-8') as fh: - config.ae.feature_columns = [x.strip() for x in fh.readlines()] - - input_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - train_data_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - - out_file = os.path.join(tmp_path, 'results.csv') - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'dfp-cloudtrail-role-g-validation-data-output.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_stage( - train_ae_stage.TrainAEStage( - config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) - pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) - pipe.add_stage( - TimeSeriesStage(config, - resolution="1m", - min_window="12 h", - hot_start=True, - cold_end=False, - filter_percent=90.0, - 
zscore_threshold=8.0)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage( - ValidationStage(config, - val_file_name=val_file_name, - results_file_name=results_file_name, - index_col="_index_", - exclude=("event_dt", "zscore"), - rel_tol=0.1)) - - pipe.add_stage(SerializeStage(config, include=[])) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - - mock_ae.fit.assert_called_once() - mock_ae.build_input_tensor.assert_called_once() - mock_ae.get_anomaly_score.assert_called() - mock_ae.get_results.assert_called_once() - - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) -@pytest.mark.usefixtures("reload_modules") -@mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_user123(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') - anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') - exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) - - mock_input_tensor = mock.MagicMock() - mock_input_tensor.return_value = mock_input_tensor - mock_input_tensor.detach.return_value = tensor_data - - mock_ae.return_value = mock_ae - mock_ae.build_input_tensor.return_value = mock_input_tensor - mock_ae.get_anomaly_score.return_value = anomaly_score - mock_ae.get_results.return_value = exp_results - - config.mode = PipelineModes.AE - config.class_labels = ["reconstruct_loss", "zscore"] - config.model_max_batch_size = 1024 - config.pipeline_batch_size = 1024 - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.ae = 
ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.userid_filter = "user123" - config.ae.timestamp_column_name = "event_dt" - - with open(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt'), encoding='UTF-8') as fh: - config.ae.feature_columns = [x.strip() for x in fh.readlines()] - - input_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - train_data_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - out_file = os.path.join(tmp_path, 'results.csv') - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'dfp-cloudtrail-user123-validation-data-output.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_stage( - train_ae_stage.TrainAEStage( - config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) - pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) - pipe.add_stage( - TimeSeriesStage(config, - resolution="1m", - min_window="12 h", - hot_start=True, - cold_end=False, - filter_percent=90.0, - zscore_threshold=8.0)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage( - ValidationStage(config, - val_file_name=val_file_name, - results_file_name=results_file_name, - index_col="_index_", - exclude=("event_dt", "zscore"), - rel_tol=0.1)) - pipe.add_stage(SerializeStage(config, include=[])) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - - mock_ae.fit.assert_called_once() - mock_ae.build_input_tensor.assert_called_once() - 
mock_ae.get_anomaly_score.assert_called() - mock_ae.get_results.assert_called_once() - - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) -@pytest.mark.usefixtures("reload_modules") -@mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_user123_multi_segment(mock_ae: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') - anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') - exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) - - mock_input_tensor = mock.MagicMock() - mock_input_tensor.return_value = mock_input_tensor - mock_input_tensor.detach.return_value = tensor_data - - mock_ae.return_value = mock_ae - mock_ae.build_input_tensor.return_value = mock_input_tensor - mock_ae.get_anomaly_score.return_value = anomaly_score - mock_ae.get_results.return_value = exp_results - - config.mode = PipelineModes.AE - config.class_labels = ["reconstruct_loss", "zscore"] - config.model_max_batch_size = 1024 - config.pipeline_batch_size = 1024 - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.userid_filter = "user123" - config.ae.timestamp_column_name = "event_dt" - - with open(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt'), encoding='UTF-8') as fh: - config.ae.feature_columns = [x.strip() for x in fh.readlines()] - - input_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - train_data_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - out_file = 
os.path.join(tmp_path, 'results.csv') - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'dfp-cloudtrail-user123-validation-data-output.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_segment_boundary(UserMessageMeta) # Boundary 1 - pipe.add_stage( - train_ae_stage.TrainAEStage( - config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipe.add_segment_boundary(ControlMessage) # Boundary 2 - pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) - pipe.add_segment_boundary(ControlMessage) # Boundary 3 - pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_segment_boundary(ControlMessage) # Boundary 4 - pipe.add_stage(AddScoresStage(config)) - pipe.add_segment_boundary(ControlMessage) # Boundary 5 - pipe.add_stage( - TimeSeriesStage(config, - resolution="1m", - min_window="12 h", - hot_start=True, - cold_end=False, - filter_percent=90.0, - zscore_threshold=8.0)) - pipe.add_segment_boundary(ControlMessage) # Boundary 6 - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage( - ValidationStage(config, - val_file_name=val_file_name, - results_file_name=results_file_name, - index_col="_index_", - exclude=("event_dt", "zscore"), - rel_tol=0.1)) - pipe.add_segment_boundary(ControlMessage) # Boundary 7 - pipe.add_stage(SerializeStage(config, include=[])) - pipe.add_segment_boundary(MessageMeta) # Boundary 8 - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - - mock_ae.fit.assert_called_once() - mock_ae.build_input_tensor.assert_called_once() - mock_ae.get_anomaly_score.assert_called() - mock_ae.get_results.assert_called_once() - - results = 
calc_error_val(results_file_name) - assert results.diff_rows == 0 diff --git a/tests/test_dfp_kafka.py b/tests/test_dfp_kafka.py deleted file mode 100755 index d952b00ae7..0000000000 --- a/tests/test_dfp_kafka.py +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import typing -from io import StringIO -from unittest import mock - -import numpy as np -import pandas as pd -import pytest - -from _utils import TEST_DIRS -from _utils.dataset_manager import DatasetManager -from _utils.kafka import KafkaTopics -from morpheus.cli import commands -from morpheus.config import Config -from morpheus.config import ConfigAutoEncoder -from morpheus.config import PipelineModes -from morpheus.io.utils import filter_null_data -from morpheus.pipeline import LinearPipeline -from morpheus.stages.general.monitor_stage import MonitorStage -from morpheus.stages.inference.auto_encoder_inference_stage import AutoEncoderInferenceStage -from morpheus.stages.input.cloud_trail_source_stage import CloudTrailSourceStage -from morpheus.stages.output.write_to_kafka_stage import WriteToKafkaStage -from morpheus.stages.postprocess.add_scores_stage import AddScoresStage -from morpheus.stages.postprocess.serialize_stage import SerializeStage -from morpheus.stages.postprocess.timeseries_stage import 
TimeSeriesStage -from morpheus.stages.preprocess import preprocess_ae_stage -from morpheus.stages.preprocess import train_ae_stage -from morpheus.utils.compare_df import compare_df -from morpheus.utils.file_utils import load_labels_file - -if (typing.TYPE_CHECKING): - from kafka import KafkaConsumer - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.reload_modules([commands, preprocess_ae_stage, train_ae_stage]) -@pytest.mark.usefixtures("reload_modules", "loglevel_debug") -@mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_roleg(mock_ae: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_tensor.csv'), delimiter=',') - anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_anomaly_score.csv'), delimiter=',') - exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_roleg_exp_results.csv')) - - mock_input_tensor = mock.MagicMock() - mock_input_tensor.return_value = mock_input_tensor - mock_input_tensor.detach.return_value = tensor_data - - mock_ae.return_value = mock_ae - mock_ae.build_input_tensor.return_value = mock_input_tensor - mock_ae.get_anomaly_score.return_value = anomaly_score - mock_ae.get_results.return_value = exp_results - - config.mode = PipelineModes.AE - config.class_labels = ["reconstruct_loss", "zscore"] - config.model_max_batch_size = 1024 - config.pipeline_batch_size = 1024 - config.feature_length = 256 - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.userid_filter = "role-g" - config.ae.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt')) - 
config.ae.timestamp_column_name = "event_dt" - - input_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - train_data_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'dfp-cloudtrail-role-g-validation-data-output.csv') - - pipe = LinearPipeline(config) - pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_stage( - train_ae_stage.TrainAEStage( - config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) - pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) - pipe.add_stage( - TimeSeriesStage(config, - resolution="1m", - min_window="12 h", - hot_start=True, - cold_end=False, - filter_percent=90.0, - zscore_threshold=8.0)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(SerializeStage(config, include=[])) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - mock_ae.fit.assert_called_once() - mock_ae.build_input_tensor.assert_called_once() - mock_ae.get_anomaly_score.assert_called() - mock_ae.get_results.assert_called_once() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f'{rec.value.decode("utf-8")}\n') - - output_buf.seek(0) - output_df = pd.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == len(val_df) - - results = compare_df( - val_df, - output_df, - replace_idx="_index_", - exclude_columns=[ - 'event_dt', - 'zscore', - 'userAgent' # userAgent in output_df includes escape 
chars in the string - ], - rel_tol=0.15, - show_report=True) - - assert results['diff_rows'] == 0 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.reload_modules([preprocess_ae_stage, train_ae_stage]) -@pytest.mark.usefixtures("reload_modules", "loglevel_debug") -@mock.patch('morpheus.stages.preprocess.train_ae_stage.AutoEncoder') -def test_dfp_user123(mock_ae: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - tensor_data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_tensor.csv'), delimiter=',') - anomaly_score = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_anomaly_score.csv'), delimiter=',') - exp_results = pd.read_csv(os.path.join(TEST_DIRS.tests_data_dir, 'dfp_user123_exp_results.csv')) - - mock_input_tensor = mock.MagicMock() - mock_input_tensor.return_value = mock_input_tensor - mock_input_tensor.detach.return_value = tensor_data - - mock_ae.return_value = mock_ae - mock_ae.build_input_tensor.return_value = mock_input_tensor - mock_ae.get_anomaly_score.return_value = anomaly_score - mock_ae.get_results.return_value = exp_results - - config.mode = PipelineModes.AE - config.class_labels = ["reconstruct_loss", "zscore"] - config.model_max_batch_size = 1024 - config.pipeline_batch_size = 1024 - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.ae = ConfigAutoEncoder() - config.ae.userid_column_name = "userIdentitysessionContextsessionIssueruserName" - config.ae.userid_filter = "user123" - config.ae.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_ae_cloudtrail.txt')) - config.ae.timestamp_column_name = "event_dt" - - input_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - train_data_glob = os.path.join(TEST_DIRS.validation_data_dir, "dfp-cloudtrail-*-input.csv") - val_file_name 
= os.path.join(TEST_DIRS.validation_data_dir, 'dfp-cloudtrail-user123-validation-data-output.csv') - - pipe = LinearPipeline(config) - pipe.set_source(CloudTrailSourceStage(config, input_glob=input_glob, sort_glob=True)) - pipe.add_stage( - train_ae_stage.TrainAEStage( - config, - train_data_glob=train_data_glob, - source_stage_class="morpheus.stages.input.cloud_trail_source_stage.CloudTrailSourceStage", - seed=42, - sort_glob=True)) - pipe.add_stage(preprocess_ae_stage.PreprocessAEStage(config)) - pipe.add_stage(AutoEncoderInferenceStage(config)) - pipe.add_stage(AddScoresStage(config)) - pipe.add_stage( - TimeSeriesStage(config, - resolution="1m", - min_window="12 h", - hot_start=True, - cold_end=False, - filter_percent=90.0, - zscore_threshold=8.0)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(SerializeStage(config, include=[])) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - mock_ae.fit.assert_called_once() - mock_ae.build_input_tensor.assert_called_once() - mock_ae.get_anomaly_score.assert_called() - mock_ae.get_results.assert_called_once() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f'{rec.value.decode("utf-8")}\n') - - output_buf.seek(0) - output_df = pd.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == len(val_df) - - results = compare_df( - val_df, - output_df, - replace_idx="_index_", - exclude_columns=[ - 'event_dt', - 'zscore', - 'userAgent' # userAgent in output_df includes escape chars in the string - ], - rel_tol=0.1, - show_report=True) - - assert results['diff_rows'] == 0 diff --git a/tests/tests_data/dfp_roleg_anomaly_score.csv b/tests/tests_data/dfp_roleg_anomaly_score.csv deleted file mode 100644 index 
00edc1dc37..0000000000 --- a/tests/tests_data/dfp_roleg_anomaly_score.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12394244db00812eeeb3ca3b5632f3fcde19108aca4fa13d28fccdf68ed64851 -size 5770 diff --git a/tests/tests_data/dfp_roleg_exp_results.csv b/tests/tests_data/dfp_roleg_exp_results.csv deleted file mode 100644 index 622a2ed1d6..0000000000 --- a/tests/tests_data/dfp_roleg_exp_results.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:845c961529eb37822bb3e12c02584d6a7b781217bd6f2156aa3429a7cd982821 -size 100646 diff --git a/tests/tests_data/dfp_roleg_tensor.csv b/tests/tests_data/dfp_roleg_tensor.csv deleted file mode 100644 index 7de2fdb416..0000000000 --- a/tests/tests_data/dfp_roleg_tensor.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1bfd2ebcda75e56571066c4bd220a393587763890a29ae98be65cdf58bdaa99c -size 2063055 diff --git a/tests/tests_data/dfp_user123_anomaly_score.csv b/tests/tests_data/dfp_user123_anomaly_score.csv deleted file mode 100644 index 6a0427f5b7..0000000000 --- a/tests/tests_data/dfp_user123_anomaly_score.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91b4ea4de9c51ca459983383e8583f11edc116537da29e695b4976bcc61df462 -size 15657 diff --git a/tests/tests_data/dfp_user123_exp_results.csv b/tests/tests_data/dfp_user123_exp_results.csv deleted file mode 100644 index ad31dfb805..0000000000 --- a/tests/tests_data/dfp_user123_exp_results.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:63d7188d4ec5f0d6d7f7979d423663190e1934b1d19ea5eca3537c98daa0de28 -size 270285 diff --git a/tests/tests_data/dfp_user123_tensor.csv b/tests/tests_data/dfp_user123_tensor.csv deleted file mode 100644 index 62ddfdab4d..0000000000 --- a/tests/tests_data/dfp_user123_tensor.csv +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:61d09187fb624696a4c248a4222caa7c13df5c3399d967659478809c16628228 -size 1636032 From c175aa06014b4e7f2be0b6b7d3e727e2c5cfd498 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 24 Sep 2024 11:31:05 -0400 Subject: [PATCH 2/7] Remove reference to removed pipeline-ae example --- docs/source/cloud_deployment_guide.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/source/cloud_deployment_guide.md b/docs/source/cloud_deployment_guide.md index 31641c365a..3d7da855c7 100644 --- a/docs/source/cloud_deployment_guide.md +++ b/docs/source/cloud_deployment_guide.md @@ -382,10 +382,9 @@ kubectl -n $NAMESPACE exec deploy/broker -c broker -- kafka-topics.sh \ This section describes example workflows to run on Morpheus. Four sample pipelines are provided. -1. AutoEncoder pipeline performing Digital Fingerprinting (DFP). -2. NLP pipeline performing Phishing Detection (PD). -3. NLP pipeline performing Sensitive Information Detection (SID). -4. FIL pipeline performing Anomalous Behavior Profiling (ABP). +1. NLP pipeline performing Phishing Detection (PD). +2. NLP pipeline performing Sensitive Information Detection (SID). +3. FIL pipeline performing Anomalous Behavior Profiling (ABP). Multiple command options are given for each pipeline, with varying data input/output methods, ranging from local files to Kafka Topics. 
From ab67c075cad5c1edff84de1f917f0b4d3867fea2 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 18 Oct 2024 12:32:05 -0400 Subject: [PATCH 3/7] flake8 fix --- python/morpheus/morpheus/cli/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/morpheus/morpheus/cli/commands.py b/python/morpheus/morpheus/cli/commands.py index 01bc69ab91..1b885ecd81 100644 --- a/python/morpheus/morpheus/cli/commands.py +++ b/python/morpheus/morpheus/cli/commands.py @@ -27,7 +27,6 @@ from morpheus.cli.utils import get_config_from_ctx from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import get_pipeline_from_ctx -from morpheus.cli.utils import parse_enum from morpheus.cli.utils import parse_log_level from morpheus.cli.utils import prepare_command from morpheus.config import Config From fb87f344a55e19e47811072d47f612e89426b20e Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 18 Oct 2024 13:38:41 -0400 Subject: [PATCH 4/7] Add link to linkcheck_ignore to fix ci doc stage failure --- docs/source/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/conf.py b/docs/source/conf.py index aa59786e26..d743df18e1 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -199,6 +199,7 @@ r'^http://$', r'^https://$', r'https://(platform\.)?openai.com', + r'https://code.visualstudio.com' ] # Add any paths that contain templates here, relative to this directory. 
From 2c1923767e3b82b6cf171ec4daee1f7df3a6b895 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Fri, 18 Oct 2024 22:01:12 -0400 Subject: [PATCH 5/7] Merge branch 'branch-24.10' of https://github.com/nv-morpheus/Morpheus into remove-starter-dfp --- .../config/vocabularies/morpheus/accept.txt | 5 + .../examples_cuda-125_arch-x86_64.yaml | 1 + dependencies.yaml | 7 + docker/run_container.sh | 57 ++++ docker/run_container_dev.sh | 38 +-- docker/run_container_release.sh | 39 +-- docs/README.md | 10 +- docs/source/basics/building_a_pipeline.md | 10 +- docs/source/basics/overview.rst | 15 +- docs/source/cloud_deployment_guide.md | 12 - docs/source/developer_guide/contributing.md | 91 ++++-- ...modular_pipeline_digital_fingerprinting.md | 6 - .../guides/1_simple_python_stage.md | 39 ++- .../guides/2_real_world_phishing.md | 71 +++-- .../guides/3_simple_cpp_stage.md | 23 +- .../guides/4_source_cpp_stage.md | 7 +- .../guides/5_digital_fingerprinting.md | 7 +- .../6_digital_fingerprinting_reference.md | 13 +- .../guides/9_control_messages.md | 36 +-- examples/abp_nvsmi_detection/README.md | 5 +- .../abp_pcap_preprocessing.py | 8 +- examples/cpu_only/README.md | 72 +++++ examples/cpu_only/run.py | 140 ++++++++++ .../1_simple_python_stage/pass_thru.py | 3 +- .../1_simple_python_stage/pass_thru_deco.py | 3 +- .../2_1_real_world_phishing/run.py | 2 +- .../developer_guide/2_2_rabbitmq/README.md | 2 + .../2_2_rabbitmq/rabbitmq_source_stage.py | 11 +- .../rabbitmq_source_stage_deco.py | 10 +- .../2_2_rabbitmq/read_simple.py | 13 +- .../2_2_rabbitmq/write_simple.py | 21 +- .../2_2_rabbitmq/write_to_rabbitmq_stage.py | 3 +- .../src/simple_cpp_stage/pass_thru.py | 3 +- .../4_rabbitmq_cpp_stage/README.md | 2 - .../rabbitmq_source_stage.py | 7 +- .../write_to_rabbitmq_stage.py | 3 +- .../4_rabbitmq_cpp_stage/src/read_simple.py | 9 +- .../4_rabbitmq_cpp_stage/src/write_simple.py | 21 +- .../production/dfp_azure_pipeline.py | 4 - .../production/dfp_duo_pipeline.py | 4 - 
.../dfp_integrated_training_batch_pipeline.py | 9 +- ..._integrated_training_streaming_pipeline.py | 9 +- .../production/grafana/run.py | 4 - .../production/morpheus/benchmarks/README.md | 10 +- .../benchmarks/benchmark_conf_generator.py | 1 - .../benchmarks/resource/pipelines_conf.json | 60 ++-- .../notebooks/dfp_azure_inference.ipynb | 4 - .../dfp_azure_integrated_training.ipynb | 3 +- .../notebooks/dfp_azure_training.ipynb | 4 - .../notebooks/dfp_duo_inference.ipynb | 4 - .../dfp_duo_integrated_training.ipynb | 1 - .../morpheus/notebooks/dfp_duo_training.ipynb | 4 - .../visualization/dfp_viz_azure_pipeline.py | 4 - .../visualization/dfp_viz_duo_pipeline.py | 4 - examples/doca/run_tcp.py | 3 - examples/doca/run_udp_convert.py | 3 - examples/doca/run_udp_raw.py | 3 - examples/doca/vdb_realtime/vdb.py | 3 - .../gnn_fraud_detection_pipeline/README.md | 6 +- examples/gnn_fraud_detection_pipeline/run.py | 3 - examples/llm/agents/README.md | 6 +- examples/llm/agents/run.py | 1 + examples/llm/agents/simple_pipeline.py | 10 +- examples/llm/cli.py | 10 +- examples/llm/completion/README.md | 5 +- examples/llm/completion/pipeline.py | 14 +- examples/llm/completion/run.py | 1 + examples/llm/vdb_upload/run.py | 2 +- examples/log_parsing/inference.py | 3 +- examples/ransomware_detection/README.md | 3 +- examples/ransomware_detection/run.py | 6 +- .../stages/create_features.py | 68 ++--- .../stages/preprocessing.py | 35 ++- examples/root_cause_analysis/README.md | 4 +- examples/sid_visualization/README.md | 4 +- examples/sid_visualization/run.py | 8 +- external/utilities | 2 +- models/model-cards/dfp-model-card.md | 2 +- morpheus.code-workspace | 12 + pyproject.toml | 5 +- .../morpheus/morpheus/_lib/common/module.cpp | 4 - .../include/morpheus/messages/control.hpp | 137 +--------- .../include/morpheus/utilities/cudf_util.hpp | 5 +- .../morpheus/_lib/messages/__init__.pyi | 2 +- .../morpheus/_lib/messages/module.cpp | 10 +- .../morpheus/_lib/src/messages/control.cpp | 115 
++++++-- .../morpheus/_lib/src/messages/meta.cpp | 3 +- .../morpheus/_lib/src/utilities/cudf_util.cpp | 10 +- .../morpheus/morpheus/_lib/stages/module.cpp | 4 - .../_lib/tests/messages/test_dev_doc_ex3.cpp | 22 +- .../_lib/tests/messages/test_messages.hpp | 17 +- .../stages/test_triton_inference_stage.cpp | 17 +- .../morpheus/_lib/tests/test_file_in_out.cpp | 16 +- .../morpheus/_lib/tests/test_utils/common.cpp | 4 - python/morpheus/morpheus/cli/commands.py | 33 ++- python/morpheus/morpheus/config.py | 41 +++ .../controllers/file_to_df_controller.py | 6 +- .../filter_detections_controller.py | 13 +- .../mlflow_model_writer_controller.py | 11 +- .../controllers/monitor_controller.py | 18 +- .../morpheus/controllers/rss_controller.py | 15 +- python/morpheus/morpheus/io/deserializers.py | 49 ++-- python/morpheus/morpheus/io/serializers.py | 12 +- python/morpheus/morpheus/io/utils.py | 98 ++++++- python/morpheus/morpheus/messages/__init__.py | 4 +- .../morpheus/messages/control_message.py | 203 ++++++++++++++ .../messages/memory/inference_memory.py | 55 ++-- .../messages/memory/response_memory.py | 27 +- .../morpheus/messages/memory/tensor_memory.py | 38 ++- .../morpheus/messages/message_meta.py | 61 +---- .../morpheus/modules/filter_detections.py | 2 +- .../morpheus/modules/payload_batcher.py | 23 +- .../morpheus/morpheus/parsers/event_parser.py | 26 +- python/morpheus/morpheus/parsers/ip.py | 227 ++++++++------- .../morpheus/morpheus/parsers/url_parser.py | 70 +++-- .../morpheus/parsers/windows_event_parser.py | 21 +- python/morpheus/morpheus/parsers/zeek.py | 15 +- .../pipeline/execution_mode_mixins.py | 69 +++++ .../morpheus/pipeline/linear_pipeline.py | 1 + python/morpheus/morpheus/pipeline/pipeline.py | 3 + .../morpheus/pipeline/preallocator_mixin.py | 27 +- .../morpheus/pipeline/single_port_stage.py | 1 - .../morpheus/morpheus/pipeline/stage_base.py | 23 ++ .../morpheus/pipeline/stage_decorator.py | 87 ++++-- .../stages/boundary/linear_boundary_stage.py | 5 +- 
.../morpheus/stages/general/monitor_stage.py | 3 +- .../stages/general/multi_processing_stage.py | 7 + .../morpheus/stages/general/trigger_stage.py | 3 +- .../stages/inference/inference_stage.py | 37 +-- .../inference/triton_inference_stage.py | 10 +- .../stages/input/appshield_source_stage.py | 81 +++--- .../morpheus/stages/input/arxiv_source.py | 15 +- .../databricks_deltalake_source_stage.py | 19 +- .../stages/input/file_source_stage.py | 12 +- .../stages/input/http_client_source_stage.py | 26 +- .../stages/input/http_server_source_stage.py | 32 ++- .../input/in_memory_data_generation_stage.py | 5 +- .../stages/input/in_memory_source_stage.py | 7 +- .../stages/input/kafka_source_stage.py | 15 +- .../morpheus/stages/input/rss_source_stage.py | 8 +- .../stages/output/compare_dataframe_stage.py | 7 +- .../stages/output/http_client_sink_stage.py | 3 +- .../stages/output/http_server_sink_stage.py | 14 +- .../stages/output/in_memory_sink_stage.py | 3 +- .../write_to_databricks_deltalake_stage.py | 5 +- .../output/write_to_elasticsearch_stage.py | 5 +- .../stages/output/write_to_file_stage.py | 5 +- .../stages/output/write_to_kafka_stage.py | 3 +- .../postprocess/add_classifications_stage.py | 2 +- .../postprocess/add_scores_stage_base.py | 3 +- .../postprocess/filter_detections_stage.py | 5 +- .../postprocess/generate_viz_frames_stage.py | 12 +- .../stages/postprocess/serialize_stage.py | 5 +- .../stages/postprocess/timeseries_stage.py | 4 +- .../stages/postprocess/validation_stage.py | 2 +- .../stages/preprocess/deserialize_stage.py | 23 +- .../stages/preprocess/drop_null_stage.py | 21 +- .../preprocess/group_by_column_stage.py | 3 +- .../preprocess/preprocess_base_stage.py | 19 +- .../stages/preprocess/preprocess_fil_stage.py | 67 +---- .../stages/preprocess/preprocess_nlp_stage.py | 99 +------ python/morpheus/morpheus/utils/column_info.py | 29 +- python/morpheus/morpheus/utils/concat_df.py | 7 +- .../morpheus/morpheus/utils/module_utils.py | 14 +- 
.../morpheus/utils/schema_transforms.py | 23 +- python/morpheus/morpheus/utils/seed.py | 23 +- .../morpheus/morpheus/utils/type_aliases.py | 25 +- python/morpheus/morpheus/utils/type_utils.py | 258 ++++++++++++++++++ .../morpheus_dfp/messages/__init__.py | 13 - .../morpheus_dfp/messages/dfp_message_meta.py | 42 --- .../morpheus_dfp/modules/dfp_inference.py | 10 +- .../morpheus_dfp/modules/dfp_training.py | 7 +- .../stages/dfp_rolling_window_stage.py | 24 +- .../stages/dfp_split_users_stage.py | 32 ++- .../morpheus_dfp/utils/config_generator.py | 5 - .../morpheus_llm/_lib/llm/module.cpp | 4 - .../morpheus_llm/llm/nodes/extracter_node.py | 8 +- .../llm/task_handlers/simple_task_handler.py | 4 +- .../modules/output/write_to_vector_db.py | 16 +- .../service/vdb/faiss_vdb_service.py | 22 +- .../service/vdb/milvus_vector_db_service.py | 9 +- .../service/vdb/vector_db_service.py | 19 +- .../stages/llm/llm_engine_stage.py | 86 +++++- tests/_utils/dataset_manager.py | 46 ++-- tests/_utils/inference_worker.py | 2 - tests/_utils/stages/check_pre_alloc.py | 11 +- .../stages/control_message_pass_thru.py | 9 +- tests/_utils/stages/conv_msg.py | 35 ++- tests/_utils/stages/dfp_length_checker.py | 3 +- tests/_utils/stages/error_raiser.py | 3 +- .../stages/in_memory_multi_source_stage.py | 3 +- .../_utils/stages/in_memory_source_x_stage.py | 3 +- tests/_utils/stages/multi_port_pass_thru.py | 3 +- tests/_utils/stages/record_thread_id_stage.py | 3 +- tests/_utils/stages/split_stage.py | 3 +- .../test_bench_agents_simple_pipeline.py | 2 +- .../test_bench_completion_pipeline.py | 2 +- .../test_bench_rag_standalone_pipeline.py | 2 +- .../test_bench_vdb_upload_pipeline.py | 2 +- tests/conftest.py | 218 ++++++--------- .../developer_guide/test_pass_thru.py | 40 ++- .../gnn_fraud_detection_pipeline/conftest.py | 2 +- .../test_classification_stage.py | 2 +- .../test_graph_construction_stage.py | 2 +- .../test_graph_sage_stage.py | 2 +- .../common/test_content_extractor_module.py | 2 - 
.../llm/common/test_web_scraper_module.py | 2 - .../llm/common/test_web_scraper_stage.py | 2 - .../test_schema_transform_module.py | 2 - tests/examples/log_parsing/conftest.py | 2 +- tests/examples/log_parsing/test_inference.py | 24 +- .../log_parsing/test_postprocessing.py | 4 +- .../examples/ransomware_detection/conftest.py | 2 +- .../test_create_features.py | 101 ++----- .../test_preprocessing.py | 29 +- tests/morpheus/apps/test_abp.py | 154 +---------- tests/morpheus/apps/test_abp_kafka.py | 98 +------ tests/morpheus/apps/test_phishing.py | 73 +---- tests/morpheus/apps/test_phishing_kafka.py | 101 +------ tests/morpheus/apps/test_sid.py | 2 +- tests/morpheus/apps/test_sid_kafka.py | 100 +------ .../test_elasticsearch_controller.py | 6 - tests/morpheus/dfencoder/test_autoencoder.py | 2 +- tests/morpheus/dfencoder/test_pkg.py | 26 -- tests/morpheus/io/test_io_utils.py | 16 ++ .../morpheus/messages/test_control_message.py | 133 +++++---- tests/morpheus/messages/test_message_meta.py | 16 +- tests/morpheus/messages/test_messages.py | 18 +- tests/morpheus/messages/test_tensor_memory.py | 123 +++++---- .../modules/test_from_control_message.py | 4 +- .../morpheus/modules/test_payload_batcher.py | 4 +- .../modules/test_to_control_message.py | 2 +- .../parsers/test_windows_event_parser.py | 1 + tests/morpheus/pipeline/test_error_pipe.py | 7 +- .../morpheus/pipeline/test_execution_modes.py | 148 ++++++++++ tests/morpheus/pipeline/test_file_in_out.py | 9 +- tests/morpheus/pipeline/test_pipe_viz.py | 9 +- tests/morpheus/pipeline/test_pipeline.py | 2 +- .../pipeline/test_preallocation_pipe.py | 10 +- .../morpheus/pipeline/test_stage_decorator.py | 34 +-- .../stages/test_add_classifications_stage.py | 22 +- .../morpheus/stages/test_add_scores_stage.py | 21 +- .../stages/test_appshield_source_stage.py | 30 +- .../stages/test_deserialize_stage_pipe.py | 2 +- .../morpheus/stages/test_file_source_stage.py | 31 +++ .../stages/test_file_source_stage_pipe.py | 4 +- 
.../stages/test_filter_detections_stage.py | 65 ++--- .../test_filter_detections_stage_pipe.py | 10 +- .../stages/test_generate_viz_frames_stage.py | 4 +- .../stages/test_http_server_sink_stage.py | 2 +- .../stages/test_http_server_source_stage.py | 7 +- tests/morpheus/stages/test_inference_stage.py | 25 +- .../stages/test_kafka_source_stage_pipe.py | 4 + .../stages/test_linear_modules_stage.py | 26 +- .../stages/test_ml_flow_drift_stage.py | 7 +- tests/morpheus/stages/test_monitor_stage.py | 2 +- .../stages/test_multi_port_modules_stage.py | 2 +- .../stages/test_multi_processing_stage.py | 35 ++- .../stages/test_preprocess_fil_stage.py | 21 +- .../stages/test_preprocess_nlp_stage.py | 36 +-- .../stages/test_rss_source_stage_pipe.py | 14 +- tests/morpheus/stages/test_serialize_stage.py | 13 +- .../morpheus/stages/test_timeseries_stage.py | 7 +- .../stages/test_triton_inference_stage.py | 25 +- .../test_write_to_elasticsearch_stage_pipe.py | 17 +- .../stages/test_write_to_file_stage.py | 48 ---- .../stages/test_write_to_kafka_stage_pipe.py | 15 +- tests/morpheus/test_cli.py | 1 - tests/morpheus/test_config.py | 45 +++ tests/morpheus/utils/test_column_info.py | 17 -- .../morpheus/utils/test_directory_watcher.py | 1 - tests/morpheus/utils/test_inference_worker.py | 6 +- tests/morpheus/utils/test_module_utils.py | 1 - tests/morpheus/utils/test_type_utils.py | 107 ++++++++ tests/morpheus_dfp/conftest.py | 26 +- .../morpheus_dfp/modules/test_dfp_training.py | 9 +- .../stages/test_dfp_mlflow_model_writer.py | 7 +- .../stages/test_dfp_rolling_window_stage.py | 80 +++--- .../stages/test_dfp_split_users_stage.py | 26 +- .../llm/nodes/test_extractor_node.py | 1 + .../llm/nodes/test_manual_extractor_node.py | 1 + .../task_handlers/test_simple_task_handler.py | 1 + .../morpheus_llm/llm/test_vdb_upload_pipe.py | 1 - .../stages/test_llm_engine_stage_pipe.py | 4 - ...st_milvus_write_to_vector_db_stage_pipe.py | 2 +- tests/test_conftest.py | 155 ++++++----- 286 files changed, 3601 
insertions(+), 3147 deletions(-) create mode 100755 docker/run_container.sh create mode 100644 examples/cpu_only/README.md create mode 100644 examples/cpu_only/run.py create mode 100644 python/morpheus/morpheus/messages/control_message.py create mode 100644 python/morpheus/morpheus/pipeline/execution_mode_mixins.py delete mode 100644 python/morpheus_dfp/morpheus_dfp/messages/__init__.py delete mode 100644 python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py delete mode 100755 tests/morpheus/dfencoder/test_pkg.py create mode 100755 tests/morpheus/pipeline/test_execution_modes.py create mode 100755 tests/morpheus/stages/test_file_source_stage.py delete mode 100755 tests/morpheus/stages/test_write_to_file_stage.py create mode 100644 tests/morpheus/utils/test_type_utils.py diff --git a/ci/vale/styles/config/vocabularies/morpheus/accept.txt b/ci/vale/styles/config/vocabularies/morpheus/accept.txt index 157edebd18..285a85c7d8 100644 --- a/ci/vale/styles/config/vocabularies/morpheus/accept.txt +++ b/ci/vale/styles/config/vocabularies/morpheus/accept.txt @@ -18,6 +18,9 @@ CMake Conda CPython [Cc]ryptocurrenc[y|ies] +cuDF +cuML +CuPy [Cc]yber [Cc]ybersecurity Cython @@ -51,7 +54,9 @@ NeMo nginx NIC NIM(s?) 
+NumPy NVIDIA +pandas [Pp]arallelization [Pp]arsable PCIe diff --git a/conda/environments/examples_cuda-125_arch-x86_64.yaml b/conda/environments/examples_cuda-125_arch-x86_64.yaml index ffcae28e4a..e387e2c9bf 100644 --- a/conda/environments/examples_cuda-125_arch-x86_64.yaml +++ b/conda/environments/examples_cuda-125_arch-x86_64.yaml @@ -42,6 +42,7 @@ dependencies: - pip - pluggy=1.3 - pydantic +- pynvml=11.4 - pypdf=3.17.4 - pypdfium2=4.30 - python-confluent-kafka>=1.9.2,<1.10.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index 95809bb0ee..05393f209c 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -150,6 +150,7 @@ files: arch: [x86_64] includes: - cve-mitigation + - example-abp-nvsmi - example-dfp-prod - example-gnn - example-llms @@ -442,6 +443,12 @@ dependencies: - dgl==2.0.0 - dglgo + example-abp-nvsmi: + common: + - output_types: [conda] + packages: + - pynvml=11.4 + example-llms: common: - output_types: [conda] diff --git a/docker/run_container.sh b/docker/run_container.sh new file mode 100755 index 0000000000..7d368556ef --- /dev/null +++ b/docker/run_container.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Color variables +b="\033[0;36m" +g="\033[0;32m" +r="\033[0;31m" +e="\033[0;90m" +y="\033[0;33m" +x="\033[0m" + +_UNDEF_VAR_ERROR_MSG="Use the dev/release scripts to set these automatically" + +DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:?"Must set \$DOCKER_IMAGE_NAME. ${_UNDEF_VAR_ERROR_MSG}"} +DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:?"Must set \$DOCKER_IMAGE_TAG. ${_UNDEF_VAR_ERROR_MSG}"} + +# DOCKER_ARGS are set by the dev/release scripts +# DOCKER_EXTRA_ARGS are optionally set by the user +DOCKER_ARGS=${DOCKER_ARGS:-""} +DOCKER_ARGS="${DOCKER_ARGS} --net=host --cap-add=sys_nice ${DOCKER_EXTRA_ARGS}" +DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} + +if [[ -n "${CPU_ONLY}" ]]; then + echo -e "${b}Executing in CPU only mode${x}" + DOCKER_ARGS="${DOCKER_ARGS} --runtime=runc" +else + echo -e "${b}Executing in GPU mode${x}" + DOCKER_ARGS="${DOCKER_ARGS} --runtime=nvidia --gpus=all" +fi + +if [[ -n "${SSH_AUTH_SOCK}" ]]; then + echo -e "${b}Setting up ssh-agent auth socket${x}" + DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" +fi + +echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" + +# Enable command logging to show what is being executed +set -x +docker run ${DOCA_EXTRA_ARGS} --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" + +{ EXIT_CODE=$?; set +x; } 2>/dev/null + +exit $EXIT_CODE diff --git a/docker/run_container_dev.sh b/docker/run_container_dev.sh index 9a2db756af..0caa949c80 100755 --- a/docker/run_container_dev.sh +++ b/docker/run_container_dev.sh @@ -14,38 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# set -x +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -# Color variables -b="\033[0;36m" -g="\033[0;32m" -r="\033[0;31m" -e="\033[0;90m" -y="\033[0;33m" -x="\033[0m" +export DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"morpheus"} +export DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"dev-$(date +'%y%m%d')"} -DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"morpheus"} -DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"dev-$(date +'%y%m%d')"} -DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} +export DOCKER_ARGS="-v $PWD:/workspace -v /dev/hugepages:/dev/hugepages --privileged" -DOCKER_ARGS="--runtime=nvidia --env WORKSPACE_VOLUME=${PWD} -v $PWD:/workspace --net=host --gpus=all --cap-add=sys_nice" - -if [[ -n "${SSH_AUTH_SOCK}" ]]; then - echo -e "${b}Setting up ssh-agent auth socket${x}" - DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" -fi - -echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" - -set -x -docker run \ - -v /dev/hugepages:/dev/hugepages \ - --privileged \ - --rm \ - -ti \ - ${DOCKER_ARGS} ${DOCKER_EXTRA_ARGS} \ - ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" - -{ EXIT_CODE=$?; set +x; } 2>/dev/null - -exit $EXIT_CODE +# Call the general run script +${SCRIPT_DIR}/run_container.sh diff --git a/docker/run_container_release.sh b/docker/run_container_release.sh index dce2132b1a..5ea4e3fd74 100755 --- a/docker/run_container_release.sh +++ b/docker/run_container_release.sh @@ -16,48 +16,23 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -# Color variables -b="\033[0;36m" -g="\033[0;32m" -r="\033[0;31m" -e="\033[0;90m" -y="\033[0;33m" -x="\033[0m" - # Change to the script file to ensure we are in the correct repo (in case were in a submodule) pushd ${SCRIPT_DIR} &> /dev/null MORPHEUS_SUPPORT_DOCA=${MORPHEUS_SUPPORT_DOCA:-OFF} -MORPHEUS_BUILD_MORPHEUS_LLM=${MORPHEUS_BUILD_MORPHEUS_LLM:-ON} -MORPHEUS_BUILD_MORPHEUS_DFP=${MORPHEUS_BUILD_MORPHEUS_DFP:-ON} - 
-DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"nvcr.io/nvidia/morpheus/morpheus"} -DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"$(git describe --tags --abbrev=0)-runtime"} -# This variable is used for passing extra arguments to the docker run command. Do not use DOCKER_ARGS for this purpose. -DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""} +export DOCKER_IMAGE_NAME=${DOCKER_IMAGE_NAME:-"nvcr.io/nvidia/morpheus/morpheus"} +export DOCKER_IMAGE_TAG=${DOCKER_IMAGE_TAG:-"$(git describe --tags --abbrev=0)-runtime"} popd &> /dev/null -DOCKER_ARGS="--runtime=nvidia --env WORKSPACE_VOLUME=${PWD} --net=host --gpus=all --cap-add=sys_nice ${DOCKER_EXTRA_ARGS}" - -if [[ -n "${SSH_AUTH_SOCK}" ]]; then - echo -e "${b}Setting up ssh-agent auth socket${x}" - DOCKER_ARGS="${DOCKER_ARGS} -v $(readlink -f $SSH_AUTH_SOCK):/ssh-agent:ro -e SSH_AUTH_SOCK=/ssh-agent" -fi - -# DPDK requires hugepage and privileged container -DOCA_EXTRA_ARGS="" +# DPDK (and thus DOCA) requires hugepage and privileged container +export DOCKER_ARGS="" if [[ ${MORPHEUS_SUPPORT_DOCA} == @(TRUE|ON) ]]; then - echo -e "${b}Enabling DOCA Support. Mounting /dev/hugepages and running in privileged mode${x}" + echo -e "Enabling DOCA Support. Mounting /dev/hugepages and running in privileged mode" DOCKER_ARGS="${DOCKER_ARGS} -v /dev/hugepages:/dev/hugepages --privileged" fi - -echo -e "${g}Launching ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}...${x}" - -# Enable command logging to show what is being executed -set -x -docker run ${DOCA_EXTRA_ARGS} --rm -ti ${DOCKER_ARGS} ${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG} "${@:-bash}" -set +x +# Call the general run script +${SCRIPT_DIR}/run_container.sh diff --git a/docs/README.md b/docs/README.md index 4fe4c43e58..b0a3162a6a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -17,18 +17,10 @@ # Building Documentation -Additional packages required for building the documentation are defined in `./conda_docs.yml`. 
- -## Install Additional Dependencies -From the root of the Morpheus repo: -```bash -conda env update --solver=libmamba -n morpheus --file conda/environments/dev_cuda-125_arch-x86_64.yaml --prune -``` - ## Build Morpheus and Documentation ``` CMAKE_CONFIGURE_EXTRA_ARGS="-DMORPHEUS_BUILD_DOCS=ON" ./scripts/compile.sh --target morpheus_docs ``` Outputs to `build/docs/html` - + If the documentation build is unsuccessful, refer to the **Out of Date Build Cache** section in [Troubleshooting](./source/extra_info/troubleshooting.md) to troubleshoot. diff --git a/docs/source/basics/building_a_pipeline.md b/docs/source/basics/building_a_pipeline.md index 65fadb0cf6..06985d5ef6 100644 --- a/docs/source/basics/building_a_pipeline.md +++ b/docs/source/basics/building_a_pipeline.md @@ -107,7 +107,7 @@ morpheus --log_level=DEBUG run pipeline-other \ Then the following error displays: ``` -RuntimeError: The to-file stage cannot handle input of . Accepted input types: (,) +RuntimeError: The to-file stage cannot handle input of . Accepted input types: (,) ``` This indicates that the ``to-file`` stage cannot accept the input type of `morpheus.messages.ControlMessage`. This is because the ``to-file`` stage has no idea how to write that class to a file; it only knows how to write instances of `morpheus.messages.message_meta.MessageMeta`. To ensure you have a valid pipeline, examine the `Accepted input types: (,)` portion of the message. This indicates you need a stage that converts from the output type of the `deserialize` stage, `ControlMessage`, to `MessageMeta`, which is exactly what the `serialize` stage does. 
@@ -207,7 +207,7 @@ This example shows an NLP Pipeline which uses several stages available in Morphe #### Launching Triton Run the following to launch Triton and load the `sid-minibert` model: ```bash -docker run --rm -ti --gpus=all -p8000:8000 -p8001:8001 -p8002:8002 nvcr.io/nvidia/morpheus/morpheus-tritonserver-models:24.10 --model-repository=/models/triton-model-repo --exit-on-error=false --model-control-mode=explicit --load-model sid-minibert-onnx +docker run --rm -ti --gpus=all -p8000:8000 -p8001:8001 -p8002:8002 nvcr.io/nvidia/morpheus/morpheus-tritonserver-models:24.10 tritonserver --model-repository=/models/triton-model-repo --exit-on-error=false --model-control-mode=explicit --load-model sid-minibert-onnx ``` #### Launching Kafka @@ -216,15 +216,15 @@ Follow steps 1-8 in [Quick Launch Kafka Cluster](../developer_guide/contributing ![../img/nlp_kitchen_sink.png](../img/nlp_kitchen_sink.png) ```bash -morpheus --log_level=INFO run --num_threads=8 --pipeline_batch_size=1024 --model_max_batch_size=32 \ +morpheus --log_level=INFO run --pipeline_batch_size=1024 --model_max_batch_size=32 \ pipeline-nlp --viz_file=.tmp/nlp_kitchen_sink.png \ from-file --filename examples/data/pcap_dump.jsonlines \ deserialize \ preprocess \ - inf-triton --model_name=sid-minibert-onnx --server_url=localhost:8001 \ + inf-triton --model_name=sid-minibert-onnx --server_url=localhost:8000 \ monitor --description "Inference Rate" --smoothing=0.001 --unit "inf" \ add-class \ - filter --threshold=0.8 \ + filter --filter_source=TENSOR --threshold=0.8 \ serialize --include 'timestamp' --exclude '^_ts_' \ to-kafka --bootstrap_servers localhost:9092 --output_topic "inference_output" \ monitor --description "ToKafka Rate" --smoothing=0.001 --unit "msg" diff --git a/docs/source/basics/overview.rst b/docs/source/basics/overview.rst index c6f6b2a348..61aef2e8b6 100644 --- a/docs/source/basics/overview.rst +++ b/docs/source/basics/overview.rst @@ -39,16 +39,22 @@ run: $ morpheus run --help Usage: 
morpheus run [OPTIONS] COMMAND [ARGS]... + Run subcommand, used for running a pipeline + Options: - --num_threads INTEGER RANGE Number of internal pipeline threads to use [default: 12; x>=1] + --num_threads INTEGER RANGE Number of internal pipeline threads to use [default: 64; x>=1] --pipeline_batch_size INTEGER RANGE Internal batch size for the pipeline. Can be much larger than the model batch size. Also used for Kafka consumers [default: 256; x>=1] --model_max_batch_size INTEGER RANGE Max batch size to use for the model [default: 8; x>=1] --edge_buffer_size INTEGER RANGE - The size of buffered channels to use between nodes in a pipeline. Larger values reduce backpressure at the cost of memory. Smaller values will push - messages through the pipeline quicker. Must be greater than 1 and a power of 2 (i.e. 2, 4, 8, 16, etc.) [default: 128; x>=2] - --use_cpp BOOLEAN Whether or not to use C++ node and message types or to prefer python. Only use as a last resort if bugs are encountered [default: True] + The size of buffered channels to use between nodes in a pipeline. Larger values reduce backpressure at the cost of memory. Smaller + values will push messages through the pipeline quicker. Must be greater than 1 and a power of 2 (i.e. 2, 4, 8, 16, etc.) [default: + 128; x>=2] + --use_cpp BOOLEAN [Deprecated] Whether or not to use C++ node and message types or to prefer python. Only use as a last resort if bugs are encountered. + Cannot be used with --use_cpu_only [default: True] + --use_cpu_only Whether or not to run in CPU only mode, setting this to True will disable C++ mode. Cannot be used with --use_cpp + --manual_seed INTEGER RANGE Manually seed the random number generators used by Morpheus, useful for testing. [x>=1] --help Show this message and exit. 
Commands: @@ -56,6 +62,7 @@ run: pipeline-nlp Run the inference pipeline with a NLP model pipeline-other Run a custom inference pipeline without a specific model type + Currently, Morpheus pipeline can be operated in four different modes. * ``pipeline-fil`` diff --git a/docs/source/cloud_deployment_guide.md b/docs/source/cloud_deployment_guide.md index 3d7da855c7..fd79c0f05e 100644 --- a/docs/source/cloud_deployment_guide.md +++ b/docs/source/cloud_deployment_guide.md @@ -438,11 +438,9 @@ Pipeline example to read data from a file, run inference using a `phishing-bert- ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=2 \ --edge_buffer_size=4 \ --pipeline_batch_size=1024 \ --model_max_batch_size=32 \ - --use_cpp=True \ pipeline-nlp \ --model_seq_length=128 \ --labels_file=data/labels_phishing.txt \ @@ -468,11 +466,9 @@ Pipeline example to read messages from an input Kafka topic, run inference using ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=2 \ --edge_buffer_size=4 \ --pipeline_batch_size=1024 \ --model_max_batch_size=32 \ - --use_cpp=True \ pipeline-nlp \ --model_seq_length=128 \ --labels_file=data/labels_phishing.txt \ @@ -515,9 +511,7 @@ Pipeline example to read data from a file, run inference using a `sid-minibert-o ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=3 \ --edge_buffer_size=4 \ - --use_cpp=True \ --pipeline_batch_size=1024 \ --model_max_batch_size=32 \ pipeline-nlp \ @@ -544,9 +538,7 @@ Pipeline example to read messages from an input Kafka topic, run inference using ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=3 \ --edge_buffer_size=4 \ - --use_cpp=True \ --pipeline_batch_size=1024 \ --model_max_batch_size=32 \ pipeline-nlp \ @@ -589,11 +581,9 @@ Pipeline example to read data 
from a file, run inference using an `abp-nvsmi-xgb ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=3 \ --edge_buffer_size=4 \ --pipeline_batch_size=1024 \ --model_max_batch_size=64 \ - --use_cpp=True \ pipeline-fil --columns_file=data/columns_fil.txt \ from-file --filename=./examples/data/nvsmi.jsonlines \ monitor --description 'FromFile Rate' --smoothing=0.001 \ @@ -615,10 +605,8 @@ Pipeline example to read messages from an input Kafka topic, run inference using ```bash helm install --set ngc.apiKey="$API_KEY" \ --set sdk.args="morpheus --log_level=DEBUG run \ - --num_threads=3 \ --pipeline_batch_size=1024 \ --model_max_batch_size=64 \ - --use_cpp=True \ pipeline-fil --columns_file=data/columns_fil.txt \ from-kafka --input_topic --bootstrap_servers broker:9092 \ monitor --description 'FromKafka Rate' --smoothing=0.001 \ diff --git a/docs/source/developer_guide/contributing.md b/docs/source/developer_guide/contributing.md index bf3f29e197..1e1123be55 100644 --- a/docs/source/developer_guide/contributing.md +++ b/docs/source/developer_guide/contributing.md @@ -153,14 +153,12 @@ This workflow utilizes a Docker container to set up most dependencies ensuring a If a Conda environment on the host machine is preferred over Docker, it is relatively easy to install the necessary dependencies (In reality, the Docker workflow creates a Conda environment inside the container). -Note: These instructions assume the user is using `mamba` instead of `conda` since its improved solver speed is very helpful when working with a large number of dependencies. If you are not familiar with `mamba` you can install it with `conda install -n base -c conda-forge mamba` (Make sure to only install into the base environment). `mamba` is a drop in replacement for `conda` and all Conda commands are compatible between the two. 
- #### Prerequisites - Volta architecture GPU or better - [CUDA 12.1](https://developer.nvidia.com/cuda-12-1-0-download-archive) -- `conda` and `mamba` - - If `conda` and `mamba` are not installed, we recommend using the MiniForge install guide which is located [here](https://github.com/conda-forge/miniforge). This will install both `conda` and `mamba` and set the channel default to use `conda-forge`. +- `conda` + - If `conda` is not installed, we recommend using the [MiniForge install guide](https://github.com/conda-forge/miniforge). This will install `conda` and set the channel default to use `conda-forge`. 1. Set up environment variables and clone the repo: ```bash @@ -168,13 +166,10 @@ Note: These instructions assume the user is using `mamba` instead of `conda` sin git clone https://github.com/nv-morpheus/Morpheus.git $MORPHEUS_ROOT cd $MORPHEUS_ROOT ``` - -2. Ensure all submodules are checked out: - -```bash -git submodule update --init --recursive -``` - +1. Ensure all submodules are checked out: + ```bash + git submodule update --init --recursive + ``` 1. Create the Morpheus Conda environment ```bash conda env create --solver=libmamba -n morpheus --file conda/environments/dev_cuda-125_arch-x86_64.yaml @@ -182,19 +177,18 @@ git submodule update --init --recursive ``` This creates a new environment named `morpheus`, and activates that environment. -1. Build Morpheus + + > **Note**: The `dev_cuda-125_arch-x86_64.yaml` Conda environment file specifies all of the dependencies required to build Morpheus and run Morpheus. However many of the examples, and optional packages such as `morpheus_llm` require additional dependencies. Alternatively the following command can be used to create the Conda environment: ```bash - ./scripts/compile.sh + conda env create --solver=libmamba -n morpheus --file conda/environments/all_cuda-125_arch-x86_64.yaml + conda activate morpheus ``` - This script will run both CMake Configure with default options and CMake build. -1. 
Install Morpheus +1. Build Morpheus ```bash - pip install -e ${MORPHEUS_ROOT}/python/morpheus - pip install -e ${MORPHEUS_ROOT}/python/morpheus_llm - pip install -e ${MORPHEUS_ROOT}/python/morpheus_dfp + ./scripts/compile.sh ``` - Once Morpheus has been built, it can be installed into the current virtual environment. -1. Test the build (Note: some tests will be skipped)\ + This script will build and install Morpheus into the Conda environment. +1. Test the build (Note: some tests will be skipped) Some of the tests will rely on external data sets. ```bash MORPHEUS_ROOT=${PWD} @@ -213,15 +207,26 @@ git submodule update --init --recursive npm install -g camouflage-server@0.15 ``` - Run all tests: - ```bash - pytest --run_slow - ``` -1. Optional: Install cuML - - Many users may wish to install cuML. Due to the complex dependency structure and versioning requirements, we need to specify exact versions of each package. The command to accomplish this is: + - Run end-to-end (aka slow) tests: + ```bash + pytest --run_slow + ``` +1. Optional: Run Kafka and Milvus tests + - Download Kafka: ```bash - mamba install -c rapidsai -c nvidia -c conda-forge cuml=23.06 + python ./ci/scripts/download_kafka.py ``` + + - Run all tests (this will skip over tests that require optional dependencies which are not installed): + ```bash + pytest --run_slow --run_kafka --run_milvus + ``` + + - Run all tests including those that require optional dependencies: + ```bash + pytest --fail_missing --run_slow --run_kafka --run_milvus + ``` + 1. Run Morpheus ```bash morpheus run pipeline-nlp ... @@ -372,6 +377,36 @@ Due to the large number of dependencies, it's common to run into build issues. T - Message indicating `git apply ...` failed - Many of the dependencies require small patches to make them work. These patches must be applied once and only once. If this error displays, try deleting the offending package from the `build/_deps/` directory or from `.cache/cpm/`. 
- If all else fails, delete the entire `build/` directory and `.cache/` directory. + - Older build artifacts when performing an in-place build. + - When built with `MORPHEUS_PYTHON_INPLACE_BUILD=ON` compiled libraries will be deployed in-place in the source tree, and older build artifacts exist in the source tree. Remove these with: + ```bash + find ./python -name "*.so" -delete + find ./examples -name "*.so" -delete + ``` + - Issues building documentation + - Intermediate documentation build artifacts can cause errors for Sphinx. To remove these, run: + ```bash + rm -rf build/docs/ docs/source/_modules docs/source/_lib + ``` + - CI Issues + - To run CI locally, the `ci/scripts/run_ci_local.sh` script can be used. For example to run a local CI build: + ```bash + ci/scripts/run_ci_local.sh build + ``` + - Build artifacts resulting from a local CI run can be found in the `.tmp/local_ci_tmp/` directory. + - To troubleshoot a particular CI stage it can be helpful to run: + ```bash + ci/scripts/run_ci_local.sh bash + ``` + + This will open a bash shell inside the CI container with all of the environment variables typically set during a CI run. From here you can run the commands that would typically be run by one of the CI scripts in `ci/scripts/github`. + + To run a CI stage requiring a GPU (ex: `test`), set the `USE_GPU` environment variable to `1`: + ```bash + USE_GPU=1 ci/scripts/run_ci_local.sh bash + ``` + +Refer to the [troubleshooting guide](../extra_info/troubleshooting.md) for more information on common issues and how to resolve them. ## Licensing Morpheus is licensed under the Apache v2.0 license. All new source files including CMake and other build scripts should contain the Apache v2.0 license header. Any edits to existing source code should update the date range of the copyright to the current year. 
The format for the license header is: @@ -401,7 +436,7 @@ Third-party code included in the source tree (that is not pulled in as an extern Ex: ``` /** - * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) , NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/docs/source/developer_guide/guides/10_modular_pipeline_digital_fingerprinting.md b/docs/source/developer_guide/guides/10_modular_pipeline_digital_fingerprinting.md index e48b8c6df2..74ddb500cb 100644 --- a/docs/source/developer_guide/guides/10_modular_pipeline_digital_fingerprinting.md +++ b/docs/source/developer_guide/guides/10_modular_pipeline_digital_fingerprinting.md @@ -539,7 +539,6 @@ To run the DFP pipelines with the example datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source duo \ --start_time "2022-08-01" \ --duration "60d" \ @@ -551,7 +550,6 @@ To run the DFP pipelines with the example datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source duo \ --start_time "2022-08-30" \ --input_file "./control_messages/duo_payload_inference.json" @@ -561,7 +559,6 @@ To run the DFP pipelines with the example datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source duo \ --start_time "2022-08-01" \ --duration "60d" \ @@ -573,7 +570,6 @@ To run the DFP pipelines with the example datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source azure \ --start_time "2022-08-01" \ --duration "60d" \ @@ -585,7 +581,6 @@ To run the DFP pipelines with the example 
datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source azure \ --start_time "2022-08-30" \ --input_file "./control_messages/azure_payload_inference.json" @@ -595,7 +590,6 @@ To run the DFP pipelines with the example datasets within the container, run the ```bash python dfp_integrated_training_batch_pipeline.py \ --log_level DEBUG \ - --use_cpp=true \ --source azure \ --start_time "2022-08-01" \ --duration "60d" \ diff --git a/docs/source/developer_guide/guides/1_simple_python_stage.md b/docs/source/developer_guide/guides/1_simple_python_stage.md index 0ed1a08d59..27586de578 100644 --- a/docs/source/developer_guide/guides/1_simple_python_stage.md +++ b/docs/source/developer_guide/guides/1_simple_python_stage.md @@ -29,7 +29,7 @@ To start, we will implement a single stage that could be included in a pipeline. ### Stand-alone Function -The stand-alone function approach is the simplest way to define a stage. The function should accept a single argument, which will be the input message, and return a single value, which will be the output message. The function should be decorated with the `morpheus.pipeline.stage_decorator.stage` decorator. +The stand-alone function approach is the simplest way to define a stage. The function should accept a single argument, which will be the input message, and return a single value, which will be the output message. The function should be decorated with the {py:func}`~morpheus.pipeline.stage_decorator.stage` decorator. ```python import typing @@ -52,6 +52,20 @@ def pass_thru_stage(message: typing.Any) -> typing.Any: return message ``` +By default, Morpheus stages are assumed to require a GPU. However since this stage doesn't perform any specific GPU operations. 
We can indicate that the stage does not require a GPU by passing a tuple of supported execution modes to the decorator as follows: +```python +import typing + +from morpheus.config import ExecutionMode +from morpheus.pipeline.stage_decorator import stage + + +@stage(name="pass-thru", execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) +def pass_thru_stage(message: typing.Any) -> typing.Any: + # Return the message for the next stage + return message +``` + We can then add our stage to a pipeline as follows: ```python config = Config() @@ -60,7 +74,7 @@ pipeline = LinearPipeline(config) pipeline.add_stage(pass_thru_stage(config)) ``` -It is possible to provide additional keyword arguments to the function. Consider the following example: +It is also possible to provide additional keyword arguments to the function. Consider the following example: ```python @stage def multiplier(message: MessageMeta, *, column: str, value: int | float = 2.0) -> MessageMeta: @@ -76,11 +90,13 @@ pipe.add_stage(multiplier(config, column='probs', value=5)) ### Stage Class -The class based approach to defining a stage offers a bit more flexibility, specifically the ability to validate constructor arguments, and perform any needed setup prior to being invoked in a pipeline. Defining this stage requires us to specify the stage type. Morpheus stages which contain a single input and a single output typically inherit from `SinglePortStage`. Stages that act as sources of data, in that they do not take an input from a prior stage but rather produce data from a source such as a file, Kafka service, or other external sources, will need to inherit from the `SingleOutputSource` base class. +The class based approach to defining a stage offers a bit more flexibility, specifically the ability to validate constructor arguments, and perform any needed setup prior to being invoked in a pipeline. Defining this stage requires us to specify the stage type. 
Morpheus stages which contain a single input and a single output typically inherit from {py:class}`~morpheus.pipeline.single_port_stage.SinglePortStage`. Stages that act as sources of data, in that they do not take an input from a prior stage but rather produce data from a source such as a file, Kafka service, or other external sources, will need to inherit from the {py:class}`~morpheus.pipeline.single_output_source.SingleOutputSource` base class. + +Stages in Morpheus define what types of data they accept, and the type of data that they emit. In this example we are emitting messages of the same type that is received, this is actually quite common and Morpheus provides a mixin class, {py:class}`~morpheus.pipeline.pass_thru_type_mixin.PassThruTypeMixin`, to simplify this. -Stages in Morpheus define what types of data they accept, and the type of data that they emit. In this example we are emitting messages of the same type that is received, this is actually quite common and Morpheus provides a mixin class, `PassThruTypeMixin`, to simplify this. +Similar to the function based stage, the class based stage will not require a GPU, and we will indicate that it is able to be used in both GPU and CPU execution modes by utilizing the {py:class}`~morpheus.pipeline.execution_mode_mixins.GpuAndCpuMixin`. -Optionally, stages can be registered as a command with the Morpheus CLI using the `register_stage` decorator. This allows for pipelines to be constructed from both pre-built stages and custom user stages via the command line. Any constructor arguments will be introspected using [`numpydoc`](https://numpydoc.readthedocs.io/en/latest/) and exposed as command line flags. Similarly, the class's docstrings will be exposed in the help string of the stage on the command line. +Optionally, stages can be registered as a command with the Morpheus CLI using the {py:func}`~morpheus.cli.register_stage.register_stage` decorator. 
This allows for pipelines to be constructed from both pre-built stages and custom user stages via the command line. Any constructor arguments will be introspected using [`numpydoc`](https://numpydoc.readthedocs.io/en/latest/) and exposed as command line flags. Similarly, the class's docstrings will be exposed in the help string of the stage on the command line. We start our class definition with a few basic imports: @@ -91,12 +107,13 @@ import mrc from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): ``` There are four methods that need to be defined in our new subclass to implement the stage interface: `name`, `accepted_types`, `compute_schema`, `supports_cpp_node`, and `_build_single`. In practice, it is often necessary to define at least one more method which will perform the actual work of the stage; by convention, this method is typically named `on_data`, which we will define in our examples. @@ -108,7 +125,7 @@ There are four methods that need to be defined in our new subclass to implement return "pass-thru" ``` -The `accepted_types` method returns a tuple of message classes that this stage is able to accept as input. Morpheus uses this to validate that the parent of this stage emits a message that this stage can accept. Since our stage is a pass through, we will declare that we can accept any incoming message type. Note that production stages will often declare only a single Morpheus message class such as `MessageMeta` or `ControlMessage` (refer to the message classes defined in `morpheus.messages` for a complete list). 
+The `accepted_types` method returns a tuple of message classes that this stage is able to accept as input. Morpheus uses this to validate that the parent of this stage emits a message that this stage can accept. Since our stage is a pass through, we will declare that we can accept any incoming message type. Note that production stages will often declare only a single Morpheus message class such as {py:class}`~morpheus.messages.MessageMeta` or {py:class}`~morpheus.messages.ControlMessage` (refer to the message classes defined in {py:mod}`~morpheus.messages` for a complete list). ```python def accepted_types(self) -> tuple: return (typing.Any,) @@ -171,12 +188,13 @@ import mrc from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ A Simple Pass Through Stage """ @@ -191,12 +209,11 @@ class PassThruStage(PassThruTypeMixin, SinglePortStage): def supports_cpp_node(self) -> bool: return False - def on_data(self, message: typing.Any): + def on_data(self, message: typing.Any) -> typing.Any: # Return the message for the next stage return message - def _build_single(self, builder: mrc.Builder, - input_node: mrc.SegmentObject) -> mrc.SegmentObject: + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = builder.make_node(self.unique_name, ops.map(self.on_data)) builder.make_edge(input_node, node) diff --git a/docs/source/developer_guide/guides/2_real_world_phishing.md b/docs/source/developer_guide/guides/2_real_world_phishing.md index c821d16a0c..b1ae038f1a 100644 --- 
a/docs/source/developer_guide/guides/2_real_world_phishing.md +++ b/docs/source/developer_guide/guides/2_real_world_phishing.md @@ -29,7 +29,7 @@ For this task, we'll need to define a new stage, which we will call our `Recipie 1. Count the number of recipients in the email's metadata. 1. Emit a Morpheus `MessageMeta` object that will contain the record content along with the augmented metadata. -For this stage, the code will be similar to the previous example with a few notable changes. We will be working with the `MessageMeta` class. This is a Morpheus message containing a [cuDF](https://docs.rapids.ai/api/cudf/stable/) [DataFrame](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/dataframe/). Since we will expect our new stage to operate on `MessageMeta` types, our new `accepted_types` method is defined as: +For this stage, the code will be similar to the previous example with a few notable changes. We will be working with the {py:class}`~morpheus.messages.MessageMeta` class. This is a Morpheus message containing a [cuDF](https://docs.rapids.ai/api/cudf/stable/) [DataFrame](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/dataframe/). Since we will expect our new stage to operate on `MessageMeta` types, our new `accepted_types` method is defined as: ```python def accepted_types(self) -> tuple: @@ -99,13 +99,13 @@ def __init__(self, config: Config): Refer to the [Stage Constructors](#stage-constructors) section for more details. -Since the purpose of this stage is specifically tied to pre-processing text data for an NLP pipeline, when we register the stage, we will explicitly limit the stage to NLP pipelines: +Since the purpose of this stage is specifically tied to pre-processing text data for an NLP pipeline, when we register the stage, we will explicitly limit the stage to NLP pipelines. 
In addition to this since the pipeline our stage is operating in is a GPU pipeline, we will not be utilizing the `GpuAndCpuMixin` mixin from the previous example.: ```python @register_stage("recipient-features", modes=[PipelineModes.NLP]) class RecipientFeaturesStage(PassThruTypeMixin, SinglePortStage): ``` -Our `_build_single` method remains unchanged from the previous example; even though we are modifying the incoming messages, our input and output types remain the same and we continue to make use of the `PassThruTypeMixin`. +Our `_build_single` method remains unchanged from the previous example; even though we are modifying the incoming messages, our input and output types remain the same and we continue to make use of the {py:class}`~morpheus.pipeline.pass_thru_type_mixin.PassThruTypeMixin`. ### The Completed Preprocessing Stage @@ -540,7 +540,7 @@ MORPHEUS_ROOT = os.environ['MORPHEUS_ROOT'] default="phishing-bert-onnx", help="The name of the model that is deployed on Tritonserver.", ) -@click.option("--server_url", default='localhost:8001', help="Tritonserver url.") +@click.option("--server_url", default='localhost:8000', help="Tritonserver url.") @click.option( "--output_file", default=os.path.join(tempfile.gettempdir(), "detections.jsonlines"), @@ -630,7 +630,7 @@ morpheus --log_level=debug --plugin examples/developer_guide/2_1_real_world_phis recipient-features \ deserialize \ preprocess --vocab_hash_file=data/bert-base-uncased-hash.txt --truncation=true --do_lower_case=true --add_special_tokens=false \ - inf-triton --model_name=phishing-bert-onnx --server_url=localhost:8001 --force_convert_inputs=true \ + inf-triton --model_name=phishing-bert-onnx --server_url=localhost:8000 --force_convert_inputs=true \ monitor --description="Inference Rate" --smoothing=0.001 --unit=inf \ add-scores --label=is_phishing \ serialize \ @@ -639,7 +639,7 @@ morpheus --log_level=debug --plugin examples/developer_guide/2_1_real_world_phis ## Stage Constructors -In our 
`RecipientFeaturesStage` example we added a constructor to our stage, however we didn't go into much detail on the implementation. Every stage constructor must receive an instance of a `morpheus.config.Config` object as its first argument and is then free to define additional stage-specific arguments after that. The Morpheus configuration object will contain configuration parameters needed by multiple stages in the pipeline, and the constructor in each Morpheus stage is free to inspect these. In contrast, parameters specific to a single stage are typically defined as constructor arguments. It is a best practice to perform any necessary validation checks in the constructor, and raising an exception in the case of mis-configuration. This allows us to fail early rather than after the pipeline has started. +In our `RecipientFeaturesStage` example we added a constructor to our stage, however we didn't go into much detail on the implementation. Every stage constructor must receive an instance of a {py:class}`~morpheus.config.Config` object as its first argument and is then free to define additional stage-specific arguments after that. The Morpheus configuration object will contain configuration parameters needed by multiple stages in the pipeline, and the constructor in each Morpheus stage is free to inspect these. In contrast, parameters specific to a single stage are typically defined as constructor arguments. It is a best practice to perform any necessary validation checks in the constructor, and raising an exception in the case of mis-configuration. This allows us to fail early rather than after the pipeline has started. In our `RecipientFeaturesStage` example, we hard-coded the Bert separator token. Let's instead refactor the code to receive that as a constructor argument. 
This new constructor argument is documented following the [`numpydoc`](https://numpydoc.readthedocs.io/en/latest/format.html#parameters) formatting style allowing it to be documented properly for both API and CLI users. Let's also take the opportunity to verify that the pipeline mode is set to `morpheus.config.PipelineModes.NLP`. @@ -742,12 +742,18 @@ Options: ### Class Based Approach -Creating a new source stage is similar to defining any other stage with a few differences. First, we will be subclassing `SingleOutputSource` including the `PreallocatorMixin`. Second, the required methods are the `name` property, `_build_source`, `compute_schema` and `supports_cpp_node` methods. +Creating a new source stage is similar to defining any other stage with a few differences. First, we will be subclassing {py:class}`~morpheus.pipeline.single_output_source.SingleOutputSource` and including the `PreallocatorMixin`. Second, the required methods are the `name` property, `_build_source`, `compute_schema` and `supports_cpp_node` methods. In this example, we will create a source that reads messages from a [RabbitMQ](https://www.rabbitmq.com/) queue using the [pika](https://pika.readthedocs.io/en/stable/#) client for Python. For simplicity, we will assume that authentication is not required for our RabbitMQ exchange and that the body of the RabbitMQ messages will be JSON formatted. Both authentication and support for other formats could be easily added later. The `PreallocatorMixin` when added to a stage class, typically a source stage, indicates that the stage emits newly constructed DataFrames either directly or contained in a `MessageMeta` instance into the pipeline. Adding this mixin allows any columns needed by other stages to be inserted into the DataFrame. +Similar to the pass through stage, this new source stage should be able to operate in both GPU and CPU execution modes, as such we will be using the `GpuAndCpuMixin` mixin. 
One thing to note is that the DataFrame payload of a `MessageMeta` object is always a `cudf.DataFrame` when running in GPU mode and a `pandas.DataFrame` when running in CPU mode. When supporting both GPU and CPU execution modes, care must be taken to avoid directly importing `cudf` (or any other package requiring a GPU) when running in CPU mode on a system without a GPU and would therefore result in an error. Stages are able to examine the execution mode with the `morpheus.config.Config.execution_mode` attribute. The {py:func}`~morpheus.utils.type_utils.get_df_pkg` helper method is used to import the appropriate DataFrame package based on the execution mode in the constructor: +```python + # This will return either cudf.DataFrame or pandas.DataFrame depending on the execution mode + self._df_pkg = get_df_pkg(config.execution_mode) +``` + The `compute_schema` method allows us to define our output type of `MessageMeta`, we do so by calling the `set_type` method of the `output_schema` attribute of the `StageSchema` object passed into the method. Of note here is that it is perfectly valid for a stage to determine its output type based upon configuration arguments passed into the constructor. However the stage must document a single output type per output port. If a stage emitted multiple output types, then the types must share a common base class which would serve as the stage's output type. 
```python def compute_schema(self, schema: StageSchema): @@ -771,7 +777,7 @@ def source_generator(self, subscription: mrc.Subscription) -> collections.abc.It if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = self._df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) @@ -799,20 +805,20 @@ import mrc import pandas as pd import pika -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) @register_stage("from-rabbitmq") -class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): +class RabbitMQSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load messages from a RabbitMQ queue. 
@@ -854,6 +860,9 @@ class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): self._poll_interval = pd.Timedelta(poll_interval) + # This will return either cudf.DataFrame or pandas.DataFrame depending on the execution mode + self._df_pkg = get_df_pkg(config.execution_mode) + @property def name(self) -> str: return "from-rabbitmq" @@ -874,7 +883,7 @@ class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = self._df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) @@ -889,7 +898,7 @@ class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): ``` ### Function Based Approach -Similar to the `stage` decorator used in previous examples Morpheus provides a `source` decorator which wraps a generator function to be used as a source stage. In the class based approach we explicitly added the `PreallocatorMixin`, when using the `source` decorator the return type annotation will be inspected and a stage will be created with the `PreallocatorMixin` if the return type is a `DataFrame` type or a message which contains a `DataFrame` (`MessageMeta` and `ControlMessage`). +Similar to the `stage` decorator used in previous examples Morpheus provides a {py:func}`~morpheus.pipeline.stage_decorator.source` decorator which wraps a generator function to be used as a source stage. In the class based approach we explicitly added the `PreallocatorMixin`, when using the `source` decorator the return type annotation will be inspected and a stage will be created with the `PreallocatorMixin` if the return type is a `DataFrame` type or a message which contains a `DataFrame` (`MessageMeta` and `ControlMessage`). 
We will also indicate which execution modes are supported by the stage by setting the `execution_modes` argument to the decorator. The code for the function will first perform the same setup as was used in the class constructor, then entering a nearly identical loop as that in the `source_generator` method. @@ -903,15 +912,15 @@ import mrc import pandas as pd import pika -import cudf - +from morpheus.config import ExecutionMode from morpheus.messages.message_meta import MessageMeta from morpheus.pipeline.stage_decorator import source +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) -@source(name="from-rabbitmq") +@source(name="from-rabbitmq", execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) def rabbitmq_source(subscription: mrc.Subscription, host: str, exchange: str, @@ -950,13 +959,15 @@ def rabbitmq_source(subscription: mrc.Subscription, poll_interval = pd.Timedelta(poll_interval) + df_pkg = get_df_pkg() + try: while subscription.is_subscribed(): (method_frame, _, body) = channel.basic_get(queue_name) if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) @@ -980,7 +991,7 @@ The code for our sink will be similar to other stages with a few changes. First, ```python @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): ``` Our sink will function as a pass-through allowing the possibility of other sinks to be added to the pipeline. We could, hypothetically, have a pipeline where we emit the results to both RabbitMQ and a file. For this reason we will also be using the `PassThruTypeMixin`. 
@@ -995,16 +1006,21 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> return node ``` -Similar to our previous examples, most of the actual business logic of the stage is contained in the `on_data` method. In this case, we grab a reference to the [cuDF](https://docs.rapids.ai/api/cudf/stable/) [DataFrame](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/dataframe/) attached to the incoming message. We then serialize to an [`io.StringIO`](https://docs.python.org/3.10/library/io.html?highlight=stringio#io.StringIO) buffer, which is then sent to RabbitMQ. +Similar to our previous examples, most of the actual business logic of the stage is contained in the `on_data` method. In this case, we grab a reference to the DataFrame attached to the incoming message. We then serialize to an [`io.StringIO`](https://docs.python.org/3.10/library/io.html?highlight=stringio#io.StringIO) buffer, which is then sent to RabbitMQ. + +> **Note**: This stage supports both GPU and CPU execution modes. When running in GPU mode, the payload of a `MessageMeta` object is always a [cuDF](https://docs.rapids.ai/api/cudf/stable/) [DataFrame](https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/dataframe/). When running in CPU mode, the payload is always a [pandas](https://pandas.pydata.org/) [DataFrame](https://pandas.pydata.org/docs/reference/frame.html). In many cases the two will be API compatible without requiring any changes to the code. In some cases, however, the API may differ slightly and there is a need to know the payload type; care must be taken not to directly import `cudf` or any other package requiring a GPU when running in CPU mode on a system without a GPU. Morpheus provides some helper methods to assist with this in the {py:mod}`~morpheus.utils.type_utils` module, such as {py:func}`~morpheus.utils.type_utils.is_cudf_type` and {py:func}`~morpheus.utils.type_utils.get_df_pkg_from_obj`.
```python -def on_data(self, message: MessageMeta): - df = message.df - buffer = StringIO() - df.to_json(buffer, orient='records', lines=True) - body = buffer.getvalue().strip() - self._channel.basic_publish(exchange=self._exchange, routing_key=self._routing_key, body=body) - return message + def on_data(self, message: MessageMeta) -> MessageMeta: + df = message.df + + buffer = StringIO() + df.to_json(buffer, orient='records', lines=True) + body = buffer.getvalue().strip() + + self._channel.basic_publish(exchange=self._exchange, routing_key=self._routing_key, body=body) + + return message ``` The two new methods introduced in this example are the `on_error` and `on_complete` methods. For both methods, we want to make sure the [connection](https://pika.readthedocs.io/en/stable/modules/connection.html) object is properly closed. @@ -1032,6 +1048,7 @@ import pika from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -1039,7 +1056,7 @@ logger = logging.getLogger(__name__) @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. diff --git a/docs/source/developer_guide/guides/3_simple_cpp_stage.md b/docs/source/developer_guide/guides/3_simple_cpp_stage.md index f21317475d..676a2df8a6 100644 --- a/docs/source/developer_guide/guides/3_simple_cpp_stage.md +++ b/docs/source/developer_guide/guides/3_simple_cpp_stage.md @@ -34,18 +34,13 @@ pip install ./ ## Overview Morpheus offers the choice of writing pipeline stages in either Python or C++. For many use cases, a Python stage is perfectly fine. 
However, in the event that a Python stage becomes a bottleneck for the pipeline, then writing a C++ implementation for the stage becomes advantageous. The C++ implementations of Morpheus stages and messages utilize the [pybind11](https://pybind11.readthedocs.io/en/stable/index.html) library to provide Python bindings. -We have been defining our stages in Python up to this point, the option of defining a C++ implementation is only available to stages implemented as classes. Many of the stages included with Morpheus have both a Python and a C++ implementation, and Morpheus will use the C++ implementations by default. You can explicitly disable the use of C++ stage implementations by calling `morpheus.config.CppConfig.set_should_use_cpp(False)`: +We have been defining our stages in Python up to this point, the option of defining a C++ implementation is only available to stages implemented as classes. Many of the stages included with Morpheus have both a Python and a C++ implementation, and Morpheus will use the C++ implementations by default when running in the GPU execution mode. When running in the CPU execution mode, Morpheus will always use the Python implementation. -```python -from morpheus.config import CppConfig -CppConfig.set_should_use_cpp(False) -``` +If a stage does not have a C++ implementation, Morpheus will fall back to the Python implementation without any additional configuration. Morpheus stages which only contain a C++ implementation, still require a Python class to register the stage, and provide the stage's configuration. -If a stage does not have a C++ implementation, Morpheus will fall back to the Python implementation without any additional configuration and operate in a hybrid execution mode. +In addition to C++ accelerated stage implementations, Morpheus also provides a C++ implementation for message primitives. 
When using the GPU execution mode (the default), constructing one of the Python message classes defined under {py:mod}`~morpheus.messages` will return a Python object with bindings to the underlying C++ implementation. -In addition to C++ accelerated stage implementations, Morpheus also provides a C++ implementation for message primitives. When C++ execution is enabled, constructing one of the Python message classes defined under `morpheus.messages` will return a Python object with bindings to the underlying C++ implementation. - -Since we are defining our stages in Python, it becomes the responsibility of the Python stage to build a C++ accelerated node. This happens in the `_build_source` and `_build_single` methods. Ultimately it is the decision of a Python stage to build a Python node or a C++ node. It is perfectly acceptable to build a Python node when `morpheus.config.CppConfig.get_should_use_cpp()` is configured to `True`. It is not acceptable, however, to build a C++ node when `morpheus.config.CppConfig.get_should_use_cpp() == False`. The reason is the C++ implementations of Morpheus' messages can be consumed by Python and C++ stage implementations alike. However when `morpheus.config.CppConfig.get_should_use_cpp() == False`, the Python implementations of each message type will be used which cannot be consumed by the C++ implementations of stages. +Since we are defining our stages in Python, it becomes the responsibility of the Python stage to build a C++ accelerated node. This happens in the `_build_source` and `_build_single` methods. The Python stage should call `self._build_cpp_node()` to determine if a C++ node should be built, and ultimately it is the decision of a Python stage to build a Python node or a C++ node. It is perfectly acceptable to build a Python node when `self._build_cpp_node()` returns `True`. It is not acceptable, however, to build a C++ node when `self._build_cpp_node()` returns `False`.
The reason is the C++ implementations of Morpheus messages can be consumed by Python and C++ stage implementations alike. However the Python implementations of Morpheus messages cannot be consumed by the C++ implementations of stages. Python stages which have a C++ implementation must advertise this functionality by returning a value of `True` from the `supports_cpp_node` method: @@ -84,7 +79,7 @@ Both the `PythonSource` and `PythonNode` classes are defined in the `pymrc/node. As in our Python guide, we will start with a simple pass through stage which can be used as a starting point for future development of other stages. Note that by convention, C++ classes in Morpheus have the same name as their corresponding Python classes and are located under a directory named `_lib`. We will be following that convention. To start, we will create a `_lib` directory and a new empty `__init__.py` file. -While our Python implementation accepts messages of any type (in the form of Python objects), on the C++ side we don't have that flexibility since our node is subject to C++ static typing rules. In practice, this isn't a limitation as we usually know which specific message types we need to work with. For this example we will be working with the `ControlMessage` as our input and output type, it is also a common base type for many other Morpheus message classes. This means that at build time our Python stage implementation is able to build a C++ node when the incoming type is `ControlMessage`, while falling back to the existing Python implementation otherwise. +While our Python implementation accepts messages of any type (in the form of Python objects), on the C++ side we don't have that flexibility since our node is subject to C++ static typing rules. In practice, this isn't a limitation as we usually know which specific message types we need to work with. For this example we will be working with the `ControlMessage` as our input and output type. 
This means that at build time our Python stage implementation is able to build a C++ node when the incoming type is `ControlMessage`, while falling back to the existing Python implementation otherwise. To start with, we have our Morpheus and MRC-specific includes: @@ -371,7 +366,7 @@ def compute_schema(self, schema: StageSchema): ``` > **Note**: We are still using the `PassThruTypeMixin` to handle the requirements of setting the output type. -As mentioned in the previous section, our `_build_single` method needs to be updated to build a C++ node when the input type is `ControlMessage` and when `morpheus.config.CppConfig.get_should_use_cpp()` is `True` using the `self._build_cpp_node()` method. The `_build_cpp_node()` method compares both `morpheus.config.CppConfig.get_should_use_cpp()` and `supports_cpp_node()` and returns `True` only when both methods return `True`. +As mentioned in the previous section, our `_build_single` method needs to be updated to build a C++ node when the input type is `ControlMessage` and when `self._build_cpp_node()` returns `True`. 
```python def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: @@ -398,13 +393,14 @@ from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): def __init__(self, config: Config): super().__init__(config) @@ -438,11 +434,10 @@ class PassThruStage(PassThruTypeMixin, SinglePortStage): builder.make_edge(input_node, node) return node - ``` ## Testing the Stage -To test the updated stage we will build a simple pipeline using the Morpheus command line tool. In order to illustrate the stage building a C++ node only when the input type is a `ControlMessage` we will insert the `pass-thru` stage in twice in the pipeline. In the first instance the input type will be `MessageMeta` and the stage will fallback to using a Python node, and in the second instance the input type will be a `ControlMessage` and the stage will build a C++ node. +To test the updated stage we will build a simple pipeline using the Morpheus command line tool. In order to illustrate the stage building a C++ node only when the input type is a `ControlMessage` we will insert the `pass-thru` stage twice in the pipeline. In the first instance the input type will be `MessageMeta` and the stage will fall back to using a Python node, and in the second instance the input type will be a `ControlMessage` and the stage will build a C++ node.
```bash PYTHONPATH="examples/developer_guide/3_simple_cpp_stage/src" \ diff --git a/docs/source/developer_guide/guides/4_source_cpp_stage.md b/docs/source/developer_guide/guides/4_source_cpp_stage.md index 49beda09c2..02049e595b 100644 --- a/docs/source/developer_guide/guides/4_source_cpp_stage.md +++ b/docs/source/developer_guide/guides/4_source_cpp_stage.md @@ -36,6 +36,8 @@ For this example, we are going to add a C++ implementation for the `RabbitMQSour For communicating with [RabbitMQ](https://www.rabbitmq.com/) we will be using the [SimpleAmqpClient](https://github.com/alanxz/SimpleAmqpClient) library, and [libcudf](https://docs.rapids.ai/api/libcudf/stable/index.html) for constructing the `DataFrame`. +> **Note**: Since the C++ implementation will only be used when the execution mode is set to GPU, it is safe to assume the C++ implementation will always interact with cuDF DataFrames, and the Python implementation will always interact with pandas DataFrames. + ## Header Definition Our includes: @@ -477,7 +479,8 @@ PYBIND11_MODULE(rabbitmq_cpp_stage, m) ## Python Changes -Previously, our stage connected to the RabbitMQ server in the constructor. This is no longer advantageous to us when C++ execution is enabled. Instead, we will record our constructor arguments and move the connection code to a new `connect` method. Our new constructor and `connect` methods are updated to: +Previously, our stage connected to the RabbitMQ server in the constructor. This is no longer advantageous to us when C++ execution is enabled. Instead, we will record our constructor arguments and move the connection code to a new `connect` method. Since this stage's C++ implementation will always be used when running in GPU mode, we can assume the Python implementation will always interact with pandas DataFrames.
+Our new constructor and `connect` methods are updated to: ```python def __init__(self, @@ -513,7 +516,7 @@ def connect(self): self._channel.queue_bind(exchange=self._exchange, queue=self._queue_name) ``` -Lastly, our `_build_source` method needs to be updated to build a C++ node when `morpheus.config.CppConfig.get_should_use_cpp()` is configured to `True` by using the `self._build_cpp_node()` method. +Lastly, our `_build_source` method needs to be updated to build a C++ node when `self._build_cpp_node()` returns `True`. ```python def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: diff --git a/docs/source/developer_guide/guides/5_digital_fingerprinting.md b/docs/source/developer_guide/guides/5_digital_fingerprinting.md index 7ada67d7c8..e96f7526bc 100644 --- a/docs/source/developer_guide/guides/5_digital_fingerprinting.md +++ b/docs/source/developer_guide/guides/5_digital_fingerprinting.md @@ -155,11 +155,14 @@ docker compose build > This is most likely due to using an older version of the `docker-compose` command, instead re-run the build with `docker compose`. Refer to [Migrate to Compose V2](https://docs.docker.com/compose/migrate/) for more information. #### Downloading the example datasets -First, we will need to install `s3fs` and then run the `examples/digital_fingerprinting/fetch_example_data.py` script. This will download the example data into the `examples/data/dfp` dir. +First, we will need to install additional requirements in to the Conda environment. Then run the `examples/digital_fingerprinting/fetch_example_data.py` script. This will download the example data into the `examples/data/dfp` dir. 
From the Morpheus repo, run: ```bash -pip install s3fs +conda env update --solver=libmamba \ + -n ${CONDA_DEFAULT_ENV} \ + --file ./conda/environments/examples_cuda-121_arch-x86_64.yaml + python examples/digital_fingerprinting/fetch_example_data.py all ``` diff --git a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md index 4f60bcf155..d60f64f19e 100644 --- a/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md +++ b/docs/source/developer_guide/guides/6_digital_fingerprinting_reference.md @@ -33,13 +33,10 @@ import os from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.cli.utils import get_package_relative_file from morpheus.utils.file_utils import load_labels_file ``` ```python -CppConfig.set_should_use_cpp(False) - config = Config() config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() @@ -88,7 +85,7 @@ Defines a single column and type-cast. | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | #### Custom Column (`CustomColumn`) Subclass of `ColumnInfo`, defines a column to be computed by a user-defined function `process_column_fn`. 
@@ -96,7 +93,7 @@ Subclass of `ColumnInfo`, defines a column to be computed by a user-defined func | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `process_column_fn` | `function` | Function which receives the entire `DataFrame` as its only input, returning a new [`pandas.Series`](https://pandas.pydata.org/docs/reference/api/pandas.Series.html) object to be stored in column `name`. | | `input_column_types` | `dict[str, str]` | The input columns and the expected [`dtype` strings](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) that are needed for this Column to successfully process. Setting this as `None` will pass all columns. Specifying which columns are needed improves performance. | @@ -139,7 +136,7 @@ Subclass of `RenameColumn`, specific to casting UTC localized `datetime` values. 
| Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_name` | `str` | Original column name | #### String-Join Column (`StringJoinColumn`) @@ -148,7 +145,7 @@ Subclass of `RenameColumn`, converts incoming `list` values to string by joining | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_name` | `str` | Original column name | | `sep` | `str` | Separator string to use for the join | @@ -158,7 +155,7 @@ Subclass of `ColumnInfo`, concatenates values from multiple columns into a new s | Argument | Type | Description | | -------- | ---- | ----------- | | `name` | `str` | Name of the destination column | -| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | | `input_columns` | `List[str]` | List of columns to concatenate | | `sep` | `str` | Separator string | diff --git a/docs/source/developer_guide/guides/9_control_messages.md b/docs/source/developer_guide/guides/9_control_messages.md index 2be63e2081..f852f2cc86 100644 --- 
a/docs/source/developer_guide/guides/9_control_messages.md +++ b/docs/source/developer_guide/guides/9_control_messages.md @@ -32,13 +32,13 @@ Control Messages are straightforward objects that contain `tasks`, `metadata`, a Control Messages can handle tasks such as `training`, `inference`, and a catchall category `other`. Tasks can be added, checked for existence, or removed from the Control Message using methods like `add_task`, `has_task`, and `remove_task`. ```python -import morpheus._lib.messages as messages +from morpheus.messages import ControlMessage task_data = { "....": "...." } -msg = messages.ControlMessage() +msg = ControlMessage() msg.add_task("training", task_data) if msg.has_task("training"): task = msg.remove_task("training") @@ -49,9 +49,9 @@ if msg.has_task("training"): Metadata is a set of key-value pairs that offer supplementary information about the Control Message and must be JSON serializable. You can set, check, and retrieve metadata values using the `set_metadata`, `has_metadata`, and `get_metadata` methods, respectively. 
```python -import morpheus._lib.messages as messages +from morpheus.messages import ControlMessage -msg = messages.ControlMessage() +msg = ControlMessage() msg.set_metadata("description", "This is a sample control message.") if msg.has_metadata("description"): description = msg.get_metadata("description") @@ -63,12 +63,13 @@ The payload of a Control Message is a Morpheus `MessageMeta` object that can car ```python import cudf -import morpheus._lib.messages as messages +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta data = cudf.DataFrame() # some data -msg_meta = messages.MessageMeta(data) -msg = messages.ControlMessage() +msg_meta = MessageMeta(data) +msg = ControlMessage() msg.payload(msg_meta) @@ -82,25 +83,18 @@ msg_meta == retrieved_payload # True **The `MultiMessage` type was deprecated in 24.06 and has been completely removed in version 24.10.** When upgrading to 24.10, all uses of `MultiMessage` need to be converted to `ControlMessage`. Each `MultiMessage` functionality has a corresponding equivalent in `ControlMessage`, as illustrated below. -```python -import cudf -from morpheus.messages import MultiMessage, ControlMessage - -data = cudf.DataFrame() -msg_meta = MessageMeta(data) -``` | **Functionality** | **MultiMessage** | **ControlMessage** | | -------------------------------------------------------------- | ------------------------------------------ | ------------------------------------------------------------------- | | Initialization | `multi_msg = MultiMessage(msg_meta)` | `control_msg = ControlMessage()`
`control_msg.payload(msg_meta)` | -| Get `cudf.DataFrame` | `multi_msg.get_meta()` | `control_msg.payload().get_data()` | -| Get columns from `cudf.DataFrame` | `multi_msg.get_meta(col_name)` | `control_msg.payload().get_data(col_name)` | -| Set columns values to `cudf.DataFrame` | `multi_msg.set_meta(col_name, value)` | `control_msg.payload().set_data(col_name, value)` | -| Get sliced `cudf.DataFrame` for given start and stop positions | `multi_msg.get_slice(start, stop)` | `control_msg.payload().get_slice(start, stop)` | -| Copy the `cudf.DataFrame` for given ranges of rows | `multi_msg.copy_ranges(ranges)` | `control_msg.payload().copy_ranges(ranges)` | +| Get `DataFrame` | `multi_msg.get_meta()` | `control_msg.payload().get_data()` | +| Get columns from `DataFrame` | `multi_msg.get_meta(col_name)` | `control_msg.payload().get_data(col_name)` | +| Set columns values to `DataFrame` | `multi_msg.set_meta(col_name, value)` | `control_msg.payload().set_data(col_name, value)` | +| Get sliced `DataFrame` for given start and stop positions | `multi_msg.get_slice(start, stop)` | `control_msg.payload().get_slice(start, stop)` | +| Copy the `DataFrame` for given ranges of rows | `multi_msg.copy_ranges(ranges)` | `control_msg.payload().copy_ranges(ranges)` | | | **MultiTensorMessage** | **ControlMessage** | -| Get the inference tensor `cupy.ndarray` | `multi_tensor_msg.tensor()` | `control_msg.tensors()` | +| Get the inference tensor `ndarray` | `multi_tensor_msg.tensor()` | `control_msg.tensors()` | | Get a specific inference tensor | `multi_tensor_msg.get_tensor(tensor_name)` | `control_msg.tensors().get_tensor(tensor_name)` | -Note that the `get_slice()` and `copy_ranges()` functions in `ControlMessage` return the `MessageMeta` after slicing, whereas these functions in `MultiMessage` return a new `MultiMessage` instance. 
+Note that in the `ControlMessage` column the `get_slice()` and `copy_ranges()` methods are being called on the `MessageMeta` payload and thus return a `MessageMeta` after slicing, whereas these functions in `MultiMessage` return a new `MultiMessage` instance. diff --git a/examples/abp_nvsmi_detection/README.md b/examples/abp_nvsmi_detection/README.md index eab83358e2..b29ad6bb84 100644 --- a/examples/abp_nvsmi_detection/README.md +++ b/examples/abp_nvsmi_detection/README.md @@ -61,7 +61,10 @@ In this example we will be using the `examples/data/nvsmi.jsonlines` dataset tha This example can be easily applied to datasets generated from your own NVIDIA GPU devices. If NetQ is not deployed in your environment, the `nvsmi_data_extract.py` script is provided which uses [pyNVML](https://pypi.org/project/nvidia-ml-py/) and [pandas](https://pandas.pydata.org/) to generate data similar to NetQ. `pyNVML` contains the Python bindings for NVIDIA Management Library (NVML), the same library used by `nvidia-smi`. -`pyNVML` and `pandas` come already installed on the Morpheus release and development Docker images. Otherwise, they will need to be installed before running the script. 
+pyNVML is not installed by default, use the following command to install it: +```bash +conda env update --solver=libmamba -n morpheus --file conda/environments/examples_cuda-121_arch-x86_64.yaml +``` Run the following to start generating your dataset: ``` diff --git a/examples/abp_pcap_detection/abp_pcap_preprocessing.py b/examples/abp_pcap_detection/abp_pcap_preprocessing.py index f4ebdfbb04..ebc0392217 100644 --- a/examples/abp_pcap_detection/abp_pcap_preprocessing.py +++ b/examples/abp_pcap_detection/abp_pcap_preprocessing.py @@ -16,17 +16,16 @@ from functools import partial import cupy as cp -import mrc import numpy as np import cudf -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryFIL from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage @@ -184,7 +183,7 @@ def round_time_kernel(timestamp, rollup_time, secs): seq_ids[:, 2] = fea_len - 1 # Create the inference memory. 
Keep in mind count here could be > than input count - memory = _messages.InferenceMemoryFIL(count=count, input__0=data, seq_ids=seq_ids) + memory = InferenceMemoryFIL(count=count, input__0=data, seq_ids=seq_ids) infer_message = ControlMessage(msg) infer_message.payload(meta) @@ -197,6 +196,3 @@ def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage fea_len=self._fea_length, fea_cols=self.features, req_cols=self.req_cols) - - def _get_preprocess_node(self, builder: mrc.Builder): - raise NotImplementedError("C++ node not implemented for this stage") diff --git a/examples/cpu_only/README.md b/examples/cpu_only/README.md new file mode 100644 index 0000000000..feac382a3f --- /dev/null +++ b/examples/cpu_only/README.md @@ -0,0 +1,72 @@ + + +# CPU Only Example Using Morpheus + +## Supported Environments +| Environment | Supported | Notes | +|-------------|-----------|-------| +| Conda | ✔ | | +| Morpheus Docker Container | ✔ | | +| Morpheus Release Container | ✔ | | +| Dev Container | ✔ | | + +## CPU Only Pipeline +This example demonstrates a simple Morpheus pipeline which is able to operate on a host without access GPU. + +> **Note**: A more complex example of a pipeline that can execute without a GPU is also available at `examples/llm/completion/README.md` + +From the root of the Morpheus repo, run: +```bash +python examples/cpu_only/run.py --help +``` + +Output: +``` +Usage: run.py [OPTIONS] + +Options: + --use_cpu_only Whether or not to run in CPU only mode, + setting this to True will disable C++ mode. + --log_level [CRITICAL|FATAL|ERROR|WARN|WARNING|INFO|DEBUG] + Specify the logging level to use. [default: + DEBUG] + --in_file PATH Input file [required] + --out_file FILE Output file [required] + --help Show this message and exit. 
+``` + +To launch the configured Morpheus pipeline with the sample data that is provided in `examples/data`, run the following: + +```bash +python examples/cpu_only/run.py --use_cpu_only --in_file=examples/data/email.jsonlines --out_file=.tmp/out.jsonlines +``` + +### CLI Example + +From the root of the Morpheus repo, run: +```bash +morpheus --log_level INFO \ + run --use_cpu_only \ + pipeline-other \ + from-file --filename=examples/data/email.jsonlines \ + monitor --description "source" \ + deserialize \ + monitor --description "deserialize" \ + serialize \ + to-file --filename=.tmp/out.jsonlines --overwrite +``` diff --git a/examples/cpu_only/run.py b/examples/cpu_only/run.py new file mode 100644 index 0000000000..f0a50a47e0 --- /dev/null +++ b/examples/cpu_only/run.py @@ -0,0 +1,140 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import pathlib +import sys +import typing + +import click + +from morpheus.cli.utils import get_log_levels +from morpheus.cli.utils import parse_log_level +from morpheus.config import Config +from morpheus.config import CppConfig +from morpheus.config import ExecutionMode +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.pipeline.linear_pipeline import LinearPipeline +from morpheus.pipeline.stage_decorator import stage +from morpheus.stages.general.monitor_stage import MonitorStage +from morpheus.stages.general.trigger_stage import TriggerStage +from morpheus.stages.input.file_source_stage import FileSourceStage +from morpheus.stages.output.write_to_file_stage import WriteToFileStage +from morpheus.stages.postprocess.serialize_stage import SerializeStage +from morpheus.stages.preprocess.deserialize_stage import DeserializeStage +from morpheus.utils.logger import configure_logging + +logger = logging.getLogger(f"morpheus.{__name__}") + + +@click.command() +@click.option('--use_cpu_only', + default=False, + type=bool, + is_flag=True, + help=("Whether or not to run in CPU only mode, setting this to True will disable C++ mode.")) +@click.option("--log_level", + default="DEBUG", + type=click.Choice(get_log_levels(), case_sensitive=False), + callback=parse_log_level, + show_default=True, + help="Specify the logging level to use.") +@click.option( + "--in_file", + help="Input file", + required=True, + type=click.Path(exists=True, readable=True), +) +@click.option( + "--out_file", + help="Output file", + type=click.Path(dir_okay=False), + default="output.csv", + required=True, +) +def run_pipeline(log_level: int, use_cpu_only: bool, in_file: pathlib.Path, out_file: pathlib.Path): + # Enable the default logger + configure_logging(log_level=log_level) + + if use_cpu_only: + execution_mode = ExecutionMode.CPU + else: + execution_mode = ExecutionMode.GPU + + config = Config() + config.execution_mode = 
execution_mode + + pipeline = LinearPipeline(config) + + pipeline.set_source(FileSourceStage(config, filename=in_file)) + + pipeline.add_stage(MonitorStage(config, description="source")) + + pipeline.add_stage(TriggerStage(config)) + + @stage(execution_modes=(execution_mode, )) + def print_msg(msg: typing.Any) -> typing.Any: + log_msg = [f"Receive a message of type {type(msg)}"] + if isinstance(msg, MessageMeta): + log_msg.append(f"- df type: {type(msg.df)}") + + logger.debug(" ".join(log_msg)) + + return msg + + pipeline.add_stage(print_msg(config)) + + pipeline.add_stage(DeserializeStage(config)) + + pipeline.add_stage(MonitorStage(config, description="deserialize")) + + @stage(execution_modes=(execution_mode, )) + def calculate_totals(msg: ControlMessage, *, total_column_name: str = "total") -> ControlMessage: + meta = msg.payload() + + with meta.mutable_dataframe() as df: + logger.debug("Received a ControlMessage with a dataframe of type %s", type(df)) + df[total_column_name] = df.select_dtypes(include="number").sum(axis=1) + + return msg + + pipeline.add_stage(calculate_totals(config)) + pipeline.add_stage(SerializeStage(config)) + pipeline.add_stage(WriteToFileStage(config, filename=out_file, overwrite=True)) + pipeline.build() + + logger.info("Running pipeline\tC++ mode = %s\texecution_mode = %s", + CppConfig.get_should_use_cpp(), + config.execution_mode) + + pipeline.run() + + known_gpu_packages = ['cudf', 'cuml', 'tensorrt', 'torch'] + known_gpu_packages_loaded = [pkg in sys.modules for pkg in known_gpu_packages] + + if any(known_gpu_packages_loaded): + for (i, pkg) in enumerate(known_gpu_packages): + if known_gpu_packages_loaded[i]: + msg = f"{pkg} is loaded" + if use_cpu_only: + logger.error(msg) + else: + logger.info(msg) + else: + logger.info("No GPU packages loaded") + + +if __name__ == "__main__": + run_pipeline() diff --git a/examples/developer_guide/1_simple_python_stage/pass_thru.py b/examples/developer_guide/1_simple_python_stage/pass_thru.py 
index 7e6a8e125c..52edba71e7 100644 --- a/examples/developer_guide/1_simple_python_stage/pass_thru.py +++ b/examples/developer_guide/1_simple_python_stage/pass_thru.py @@ -19,12 +19,13 @@ from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ A Simple Pass Through Stage """ diff --git a/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py b/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py index da9c51fa9a..cd71e83b63 100644 --- a/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py +++ b/examples/developer_guide/1_simple_python_stage/pass_thru_deco.py @@ -15,10 +15,11 @@ import typing +from morpheus.config import ExecutionMode from morpheus.pipeline.stage_decorator import stage -@stage +@stage(name="pass-thru", execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) def pass_thru_stage(message: typing.Any) -> typing.Any: # Return the message for the next stage return message diff --git a/examples/developer_guide/2_1_real_world_phishing/run.py b/examples/developer_guide/2_1_real_world_phishing/run.py index b0907924aa..32e53042f7 100755 --- a/examples/developer_guide/2_1_real_world_phishing/run.py +++ b/examples/developer_guide/2_1_real_world_phishing/run.py @@ -75,7 +75,7 @@ default="phishing-bert-onnx", help="The name of the model that is deployed on Tritonserver.", ) -@click.option("--server_url", default='localhost:8001', help="Tritonserver url.") +@click.option("--server_url", default='localhost:8000', help="Tritonserver url.") @click.option( "--output_file", default=os.path.join(tempfile.gettempdir(), "detections.jsonlines"), 
diff --git a/examples/developer_guide/2_2_rabbitmq/README.md b/examples/developer_guide/2_2_rabbitmq/README.md index cadd6075a2..db9465a31e 100644 --- a/examples/developer_guide/2_2_rabbitmq/README.md +++ b/examples/developer_guide/2_2_rabbitmq/README.md @@ -54,6 +54,7 @@ If no exchange named 'logs' exists in RabbitMQ it will be created. By default th ## Launch the writer In a third terminal from the root of the Morpheus repo execute: ```bash +export MORPHEUS_ROOT=$(pwd) python examples/developer_guide/2_2_rabbitmq/write_simple.py ``` @@ -61,6 +62,7 @@ This will read JSON data from the `examples/data/email.jsonlines` file and publi The `write_simple.py` script will exit as soon as the message is written to the queue. The `read_simple.py` script will continue reading from the queue until explicitly shut down with a control-C. +> **Note**: Both the `read_simple.py` and `write_simple.py` scripts will launch independent Morpheus pipelines, both of which can optionally execute in CPU-only mode by setting the `--use_cpu_only` flag. ## Alternate Morpheus CLI usage In the above examples we defined the pipeline using the Python API in the `read_simple.py` and `write_simple.py` scripts. Alternately, we could have defined the same pipelines using the Morpheus CLI tool. 
diff --git a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py index 347d02e131..182e9e556f 100644 --- a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py +++ b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage.py @@ -22,20 +22,20 @@ import pandas as pd import pika -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) @register_stage("from-rabbitmq") -class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): +class RabbitMQSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load messages from a RabbitMQ queue. 
@@ -77,6 +77,9 @@ def __init__(self, self._poll_interval = pd.Timedelta(poll_interval) + # This will return either cudf.DataFrame or pandas.DataFrame depending on the execution mode + self._df_pkg = get_df_pkg(config.execution_mode) + @property def name(self) -> str: return "from-rabbitmq" @@ -97,7 +100,7 @@ def source_generator(self, subscription: mrc.Subscription) -> collections.abc.It if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = self._df_pkg.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py index de24cf9873..58255bf557 100644 --- a/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py +++ b/examples/developer_guide/2_2_rabbitmq/rabbitmq_source_stage_deco.py @@ -22,15 +22,15 @@ import pandas as pd import pika -import cudf - +from morpheus.config import ExecutionMode from morpheus.messages.message_meta import MessageMeta from morpheus.pipeline.stage_decorator import source +from morpheus.utils.type_utils import get_df_pkg logger = logging.getLogger(__name__) -@source(name="from-rabbitmq") +@source(name="from-rabbitmq", execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) def rabbitmq_source(subscription: mrc.Subscription, host: str, exchange: str, @@ -69,13 +69,15 @@ def rabbitmq_source(subscription: mrc.Subscription, poll_interval = pd.Timedelta(poll_interval) + df_pkg = get_df_pkg() + try: while subscription.is_subscribed(): (method_frame, _, body) = channel.basic_get(queue_name) if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = df_pkg.read_json(buffer, orient='records', 
lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/2_2_rabbitmq/read_simple.py b/examples/developer_guide/2_2_rabbitmq/read_simple.py index 2b26d2ba6a..c00e0728ed 100755 --- a/examples/developer_guide/2_2_rabbitmq/read_simple.py +++ b/examples/developer_guide/2_2_rabbitmq/read_simple.py @@ -22,6 +22,7 @@ from morpheus.common import FileTypes from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -33,12 +34,20 @@ is_flag=True, default=False, help="Use the function based version of the RabbitMQ source stage instead of the class") -def run_pipeline(use_source_function: bool): +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_source_function: bool, use_cpu_only: bool, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear pipeline object pipeline = LinearPipeline(config) diff --git a/examples/developer_guide/2_2_rabbitmq/write_simple.py b/examples/developer_guide/2_2_rabbitmq/write_simple.py index 78fa2c3d26..f2e6e76430 100755 --- a/examples/developer_guide/2_2_rabbitmq/write_simple.py +++ b/examples/developer_guide/2_2_rabbitmq/write_simple.py @@ -16,23 +16,34 @@ import logging import os +import click from write_to_rabbitmq_stage import 
WriteToRabbitMQStage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.utils.logger import configure_logging -def run_pipeline(): +@click.command() +@click.option('--input_file', + type=click.Path(exists=True, readable=True), + default=os.path.join(os.environ['MORPHEUS_ROOT'], 'examples/data/email.jsonlines')) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_cpu_only: bool, input_file: str, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - root_dir = os.environ['MORPHEUS_ROOT'] - input_file = os.path.join(root_dir, 'examples/data/email.jsonlines') - config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear pipeline object pipeline = LinearPipeline(config) diff --git a/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py b/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py index 401d8b785e..fb5382eda6 100644 --- a/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py +++ b/examples/developer_guide/2_2_rabbitmq/write_to_rabbitmq_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -29,7 +30,7 @@ 
@register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. diff --git a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py index 3b71aa727f..9ea9d1d8f6 100644 --- a/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py +++ b/examples/developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py @@ -21,13 +21,14 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @register_stage("pass-thru") -class PassThruStage(PassThruTypeMixin, SinglePortStage): +class PassThruStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): def __init__(self, config: Config): super().__init__(config) diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md index 988381e1c6..33db31f443 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/README.md +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/README.md @@ -18,8 +18,6 @@ limitations under the License. # Example RabbitMQ stages This example builds upon the `examples/developer_guide/2_2_rabbitmq` example adding a C++ implementation for the `RabbitMQSourceStage` along with adding package install scripts. -This example adds two flags to the `read_simple.py` script. 
A `--use_cpp` flag which defaults to `True` and a `--num_threads` flag which defaults to the number of cores on the system as returned by `len(os.sched_getaffinity(0))`. - ## Supported Environments | Environment | Supported | Notes | |-------------|-----------|-------| diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py index 453041534c..a408ca0b49 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/rabbitmq_source_stage.py @@ -21,11 +21,10 @@ import pandas as pd import pika -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -34,7 +33,7 @@ @register_stage("from-rabbitmq") -class RabbitMQSourceStage(PreallocatorMixin, SingleOutputSource): +class RabbitMQSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage used to load messages from a RabbitMQ queue. 
@@ -119,7 +118,7 @@ def source_generator(self, subscription: mrc.Subscription): if method_frame is not None: try: buffer = StringIO(body.decode("utf-8")) - df = cudf.io.read_json(buffer, orient='records', lines=True) + df = pd.read_json(buffer, orient='records', lines=True) yield MessageMeta(df=df) except Exception as ex: logger.exception("Error occurred converting RabbitMQ message to Dataframe: %s", ex) diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py index 401d8b785e..fb5382eda6 100644 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/rabbitmq_cpp_stage/write_to_rabbitmq_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -29,7 +30,7 @@ @register_stage("to-rabbitmq") -class WriteToRabbitMQStage(PassThruTypeMixin, SinglePortStage): +class WriteToRabbitMQStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Source stage used to load messages from a RabbitMQ queue. 
diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py index b8271bb79a..66d5ffd76b 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/read_simple.py @@ -21,7 +21,7 @@ from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -29,20 +29,19 @@ @click.command() -@click.option('--use_cpp', default=True) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), type=click.IntRange(min=1), help="Number of internal pipeline threads to use", ) -def run_pipeline(use_cpp, num_threads): +def run_pipeline(use_cpu_only: bool, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(use_cpp) - config = Config() + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU config.num_threads = num_threads # Create a linear pipeline object diff --git a/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py b/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py index b9cdf761e5..a4954d8ae1 100755 --- a/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py +++ b/examples/developer_guide/4_rabbitmq_cpp_stage/src/write_simple.py @@ -16,23 +16,34 @@ import logging import os +import click from rabbitmq_cpp_stage.write_to_rabbitmq_stage import WriteToRabbitMQStage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline import LinearPipeline from 
morpheus.stages.input.file_source_stage import FileSourceStage from morpheus.utils.logger import configure_logging -def run_pipeline(): +@click.command() +@click.option('--input_file', + type=click.Path(exists=True, readable=True), + default=os.path.join(os.environ['MORPHEUS_ROOT'], 'examples/data/email.jsonlines')) +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help=("Whether or not to run in CPU only mode")) +@click.option( + "--num_threads", + default=len(os.sched_getaffinity(0)), + type=click.IntRange(min=1), + help="Number of internal pipeline threads to use", +) +def run_pipeline(use_cpu_only: bool, input_file: str, num_threads: int): # Enable the Morpheus logger configure_logging(log_level=logging.DEBUG) - root_dir = os.environ['MORPHEUS_ROOT'] - input_file = os.path.join(root_dir, 'examples/data/email.jsonlines') - config = Config() - config.num_threads = len(os.sched_getaffinity(0)) + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + config.num_threads = num_threads # Create a linear pipeline object pipeline = LinearPipeline(config) diff --git a/examples/digital_fingerprinting/production/dfp_azure_pipeline.py b/examples/digital_fingerprinting/production/dfp_azure_pipeline.py index dab4122ebd..d470217b83 100644 --- a/examples/digital_fingerprinting/production/dfp_azure_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_azure_pipeline.py @@ -32,7 +32,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -230,9 +229,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = 
len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/dfp_duo_pipeline.py b/examples/digital_fingerprinting/production/dfp_duo_pipeline.py index c1e3e00495..2cd08bfb7b 100644 --- a/examples/digital_fingerprinting/production/dfp_duo_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_duo_pipeline.py @@ -32,7 +32,6 @@ from morpheus.common import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -227,9 +226,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py b/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py index 5e857929f7..7782961760 100644 --- a/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_integrated_training_batch_pipeline.py @@ -74,12 +74,6 @@ default="60d", help="The training duration to run starting from start_time", ) -@click.option( - "--use_cpp", - type=click.BOOL, - default=True, - help=("Indicates what type of logs are going to be used in the workload."), -) @click.option( "--cache_dir", type=str, @@ -135,7 +129,6 @@ def run_pipeline(source: str, sample_rate_s: int, tracking_uri, silence_monitors, - use_cpp, mlflow_experiment_name_template, mlflow_model_name_template, **kwargs): @@ -167,7 +160,7 @@ def run_pipeline(source: str, # Default timestamp column -- override with ControlMessage timestamp_column_name = 
"timestamp" - config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name, use_cpp=use_cpp) + config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name) # Construct the data frame Schema used to normalize incoming data schema_builder = SchemaBuilder(config, source) diff --git a/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py b/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py index 587dc81358..198bfa528d 100644 --- a/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py +++ b/examples/digital_fingerprinting/production/dfp_integrated_training_streaming_pipeline.py @@ -74,12 +74,6 @@ default="60d", help="The training duration to run starting from start_time", ) -@click.option( - "--use_cpp", - type=click.BOOL, - default=True, - help=("Indicates what type of logs are going to be used in the workload."), -) @click.option( "--cache_dir", type=str, @@ -147,7 +141,6 @@ def run_pipeline(source: str, sample_rate_s: int, tracking_uri, silence_monitors, - use_cpp, mlflow_experiment_name_template, mlflow_model_name_template, **kwargs): @@ -180,7 +173,7 @@ def run_pipeline(source: str, # Default timestamp column -- override with ControlMessage timestamp_column_name = "timestamp" - config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name, use_cpp=use_cpp) + config: Config = generate_ae_config(source, userid_column_name, timestamp_column_name) # Construct the data frame Schema used to normalize incoming data schema_builder = SchemaBuilder(config, source) diff --git a/examples/digital_fingerprinting/production/grafana/run.py b/examples/digital_fingerprinting/production/grafana/run.py index 47d8e927d5..c62c0de1c6 100644 --- a/examples/digital_fingerprinting/production/grafana/run.py +++ b/examples/digital_fingerprinting/production/grafana/run.py @@ -34,7 +34,6 @@ from morpheus.common 
import FilterSource from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.output.write_to_file_stage import WriteToFileStage @@ -242,9 +241,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md index e43755094e..f27e29cdd4 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/README.md @@ -38,14 +38,9 @@ In the `/workspace` directory of the container, run the following to compile Mor ./scripts/compile.sh ``` -Now install Morpheus: -```bash -pip install -e /workspace -``` - Install additional required dependencies: ```bash -mamba env update \ +conda env update --solver=libmamba \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` @@ -87,8 +82,7 @@ Morpheus pipeline configurations for each workflow are managed using [pipelines_ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, ... 
``` diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py b/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py index 480893e8b8..d8a1825b72 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/benchmark_conf_generator.py @@ -100,7 +100,6 @@ def _create_config(self): config = generate_ae_config(source=(self._pipe_conf.get('source')), userid_column_name=(self._pipe_conf.get('userid_column_name')), timestamp_column_name=(self._pipe_conf.get('timestamp_column_name')), - use_cpp=(self._pipe_conf.get('use_cpp')), pipeline_batch_size=(self._pipe_conf.get('pipeline_batch_size')), edge_buffer_size=(self._pipe_conf.get('edge_buffer_size')), num_threads=(self._pipe_conf.get('num_threads'))) diff --git a/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json b/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json index a15edde34f..6049014497 100644 --- a/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json +++ b/examples/digital_fingerprinting/production/morpheus/benchmarks/resource/pipelines_conf.json @@ -9,8 +9,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_lti_e2e": { "message_path": "../control_messages/azure_payload_lti.json", @@ -21,8 +20,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_lti_s3_e2e": { "message_path": "../control_messages/azure_payload_lti_s3.json", @@ -33,8 +31,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - 
"source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_payload_training_e2e": { "message_path": "../control_messages/azure_payload_training.json", @@ -45,8 +42,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_inference_e2e": { "message_path": "../control_messages/azure_streaming_inference.json", @@ -57,8 +53,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_lti_e2e": { "message_path": "../control_messages/azure_streaming_lti.json", @@ -69,8 +64,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_azure_streaming_training_e2e": { "message_path": "../control_messages/azure_streaming_training.json", @@ -81,8 +75,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": true + "source": "azure" }, "test_dfp_modules_duo_payload_inference_e2e": { "message_path": "../control_messages/duo_payload_inference.json", @@ -93,8 +86,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_lti_e2e": { "message_path": "../control_messages/duo_payload_lti.json", @@ -105,8 +97,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_only_load_e2e": { "message_path": "../control_messages/duo_payload_only_load.json", @@ -117,8 +108,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": 
"duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_payload_training_e2e": { "message_path": "../control_messages/duo_payload_training.json", @@ -129,8 +119,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_inference_e2e": { "message_path": "../control_messages/duo_streaming_inference.json", @@ -141,8 +130,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_lti_e2e": { "message_path": "../control_messages/duo_streaming_lti.json", @@ -153,8 +141,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_only_load_e2e": { "message_path": "../control_messages/duo_streaming_only_load.json", @@ -165,8 +152,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_payload_e2e": { "message_path": "../control_messages/duo_streaming_payload.json", @@ -177,8 +163,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_modules_duo_streaming_training_e2e": { "message_path": "../control_messages/duo_streaming_training.json", @@ -189,8 +174,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": true + "source": "duo" }, "test_dfp_stages_azure_training_e2e": { "glob_path": "../../../../data/dfp/azure-training-data/*.json", @@ -201,8 +185,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": false 
+ "source": "azure" }, "test_dfp_stages_azure_inference_e2e": { "glob_path": "../../../../data/dfp/azure-inference-data/*.json", @@ -213,8 +196,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "azure", - "use_cpp": false + "source": "azure" }, "test_dfp_stages_duo_training_e2e": { "glob_path": "../../../../data/dfp/duo-training-data/*.json", @@ -225,8 +207,7 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": false + "source": "duo" }, "test_dfp_stages_duo_inference_e2e": { "glob_path": "../../../../data/dfp/duo-inference-data/*.json", @@ -237,7 +218,6 @@ "duration": "60d", "userid_column_name": "username", "timestamp_column_name": "timestamp", - "source": "duo", - "use_cpp": false + "source": "duo" } } diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb index 39be0c336f..7047b9003c 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_inference.ipynb @@ -65,7 +65,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", "from morpheus.utils.column_info import ColumnInfo\n", @@ -194,9 +193,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git 
a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb index 0002a318b8..3377fb2158 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_integrated_training.ipynb @@ -187,8 +187,7 @@ "config: Config = generate_ae_config(\n", " source,\n", " userid_column_name=\"username\",\n", - " timestamp_column_name=\"timestamp\",\n", - " use_cpp=True,\n", + " timestamp_column_name=\"timestamp\"\n", ")\n", "\n", "# Construct the dataframe Schema which is used to normalize incoming azure logs\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb index 4547dea6e9..a30d892b5e 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_azure_training.ipynb @@ -62,7 +62,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.utils.column_info import ColumnInfo\n", "from morpheus.utils.column_info import DataFrameInputSchema\n", @@ -191,9 +190,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb 
index 675952b652..c407b5caef 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_inference.ipynb @@ -63,7 +63,6 @@ "from morpheus.cli.utils import parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.stages.output.write_to_file_stage import WriteToFileStage\n", "from morpheus.utils.column_info import BoolColumn\n", @@ -193,9 +192,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb index 086786e9a1..60fbd83b5b 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_integrated_training.ipynb @@ -190,7 +190,6 @@ " source,\n", " userid_column_name=\"username\",\n", " timestamp_column_name=\"timestamp\",\n", - " use_cpp=True,\n", ")\n", "\n", "# Construct the dataframe Schema which is used to normalize incoming duo logs\n", diff --git a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb index a0a30e2c07..35a4fa02d5 100644 --- a/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb +++ b/examples/digital_fingerprinting/production/morpheus/notebooks/dfp_duo_training.ipynb @@ -62,7 +62,6 @@ "from morpheus.cli.utils import 
parse_log_level\n", "from morpheus.config import Config\n", "from morpheus.config import ConfigAutoEncoder\n", - "from morpheus.config import CppConfig\n", "from morpheus.pipeline import LinearPipeline\n", "from morpheus.utils.column_info import BoolColumn\n", "from morpheus.utils.column_info import ColumnInfo\n", @@ -192,9 +191,6 @@ "configure_logging(log_level=logging.DEBUG)\n", "\n", "config = Config()\n", - "\n", - "CppConfig.set_should_use_cpp(False)\n", - "\n", "config.num_threads = len(os.sched_getaffinity(0))\n", "\n", "config.ae = ConfigAutoEncoder()\n", diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py index 09d6304042..0143dcc6c6 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_azure_pipeline.py @@ -30,7 +30,6 @@ from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.utils.column_info import ColumnInfo @@ -180,9 +179,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py index f039644b77..475e34e245 100644 --- a/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py +++ b/examples/digital_fingerprinting/visualization/dfp_viz_duo_pipeline.py @@ -30,7 +30,6 @@ from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from 
morpheus.config import CppConfig from morpheus.pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.utils.column_info import BoolColumn @@ -183,9 +182,6 @@ def run_pipeline(train_users, logger.info("Tracking URI: %s", mlflow.get_tracking_uri()) config = Config() - - CppConfig.set_should_use_cpp(False) - config.num_threads = len(os.sched_getaffinity(0)) config.ae = ConfigAutoEncoder() diff --git a/examples/doca/run_tcp.py b/examples/doca/run_tcp.py index 5c4b4035a7..cf2e797efc 100644 --- a/examples/doca/run_tcp.py +++ b/examples/doca/run_tcp.py @@ -17,7 +17,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.doca.doca_convert_stage import DocaConvertStage @@ -71,8 +70,6 @@ def run_pipeline(pipeline_batch_size, model_max_batch_size, model_fea_length, ou # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/run_udp_convert.py b/examples/doca/run_udp_convert.py index 52c9b216b7..c88c80ac6c 100644 --- a/examples/doca/run_udp_convert.py +++ b/examples/doca/run_udp_convert.py @@ -19,7 +19,6 @@ from morpheus.cli.utils import get_log_levels from morpheus.cli.utils import parse_log_level from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.messages import RawPacketMessage from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -90,8 +89,6 @@ def run_pipeline(nic_addr: str, # Enable the default logger configure_logging(log_level=log_level) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/run_udp_raw.py b/examples/doca/run_udp_raw.py index 576ecff957..cb31c5bb6c 100644 --- 
a/examples/doca/run_udp_raw.py +++ b/examples/doca/run_udp_raw.py @@ -17,7 +17,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.messages import RawPacketMessage from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -41,8 +40,6 @@ def run_pipeline(nic_addr, gpu_addr): # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP diff --git a/examples/doca/vdb_realtime/vdb.py b/examples/doca/vdb_realtime/vdb.py index 79c9aee420..226213d2f6 100644 --- a/examples/doca/vdb_realtime/vdb.py +++ b/examples/doca/vdb_realtime/vdb.py @@ -18,7 +18,6 @@ import pymilvus from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.doca.doca_convert_stage import DocaConvertStage @@ -119,8 +118,6 @@ def run_pipeline(nic_addr: str, # Enable the default logger configure_logging(log_level=logging.DEBUG) - CppConfig.set_should_use_cpp(True) - config = Config() config.mode = PipelineModes.NLP config.pipeline_batch_size = 1024 diff --git a/examples/gnn_fraud_detection_pipeline/README.md b/examples/gnn_fraud_detection_pipeline/README.md index 3945eced97..c7206787a6 100644 --- a/examples/gnn_fraud_detection_pipeline/README.md +++ b/examples/gnn_fraud_detection_pipeline/README.md @@ -27,10 +27,10 @@ All environments require additional Conda packages which can be installed with e ## Requirements -Prior to running the GNN fraud detection pipeline, additional requirements must be installed in to your Conda environment. A supplemental requirements file has been provided in this example directory. +Prior to running the GNN fraud detection pipeline, additional requirements must be installed into your Conda environment.
```bash -mamba env update \ +conda env update --solver=libmamba \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` @@ -117,7 +117,7 @@ From the root of the Morpheus repo, run: PYTHONPATH="examples" \ morpheus --log_level INFO \ --plugin "gnn_fraud_detection_pipeline" \ - run --use_cpp False --pipeline_batch_size 1024 --model_max_batch_size 32 --edge_buffer_size 4 \ + run --pipeline_batch_size 1024 --model_max_batch_size 32 --edge_buffer_size 4 \ pipeline-other --model_fea_length 70 --label=probs \ from-file --filename examples/gnn_fraud_detection_pipeline/validation.csv --filter_null False \ deserialize \ diff --git a/examples/gnn_fraud_detection_pipeline/run.py b/examples/gnn_fraud_detection_pipeline/run.py index 6a3268f174..27361d05ea 100644 --- a/examples/gnn_fraud_detection_pipeline/run.py +++ b/examples/gnn_fraud_detection_pipeline/run.py @@ -18,7 +18,6 @@ import click from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -100,8 +99,6 @@ def run_pipeline(num_threads, # Enable the default logger. configure_logging(log_level=logging.INFO) - CppConfig.set_should_use_cpp(False) - # Its necessary to get the global config object and configure it for FIL mode. config = Config() config.mode = PipelineModes.OTHER diff --git a/examples/llm/agents/README.md b/examples/llm/agents/README.md index 2721452a93..bb5c1d9bc3 100644 --- a/examples/llm/agents/README.md +++ b/examples/llm/agents/README.md @@ -104,7 +104,7 @@ export SERPAPI_API_KEY="" Install the required dependencies. 
```bash -mamba env update \ +conda env update --solver=libmamba \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` @@ -131,6 +131,10 @@ python examples/llm/main.py agents simple [OPTIONS] ``` ### Options: +- `--use_cpu_only` + - **Description**: Run in CPU only mode + - **Default**: `False` + - `--num_threads INTEGER RANGE` - **Description**: Number of internal pipeline threads to use. - **Default**: `12` diff --git a/examples/llm/agents/run.py b/examples/llm/agents/run.py index b643926a2d..60d85eac84 100644 --- a/examples/llm/agents/run.py +++ b/examples/llm/agents/run.py @@ -25,6 +25,7 @@ def run(): @run.command(help="Runs a simple finite pipeline with a single execution of a LangChain agent from a fixed input") +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help="Run in CPU only mode") @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), diff --git a/examples/llm/agents/simple_pipeline.py b/examples/llm/agents/simple_pipeline.py index 78bfc00039..7fd7e1fdcb 100644 --- a/examples/llm/agents/simple_pipeline.py +++ b/examples/llm/agents/simple_pipeline.py @@ -15,13 +15,13 @@ import logging import time -import cudf - from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage from morpheus.utils.concat_df import concat_dataframes +from morpheus.utils.type_utils import get_df_class from .common import build_common_pipeline @@ -29,6 +29,7 @@ def pipeline( + use_cpu_only: bool, num_threads: int, pipeline_batch_size, model_max_batch_size, @@ -36,6 +37,7 @@ def pipeline( repeat_count, ) -> float: config = Config() + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU config.mode = PipelineModes.OTHER # Below properties are specified by the command line @@ -45,9 
+47,9 @@ def pipeline( config.mode = PipelineModes.NLP config.edge_buffer_size = 128 + df_class = get_df_class(config.execution_mode) source_dfs = [ - cudf.DataFrame( - {"questions": ["Who is Leo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?"]}) + df_class({"questions": ["Who is Leo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?"]}) ] completion_task = {"task_type": "completion", "task_dict": {"input_keys": ["questions"], }} diff --git a/examples/llm/cli.py b/examples/llm/cli.py index 867d670345..3eae3f5ffb 100644 --- a/examples/llm/cli.py +++ b/examples/llm/cli.py @@ -35,23 +35,15 @@ type=click.Choice(get_log_levels(), case_sensitive=False), callback=parse_log_level, help="Specify the logging level to use.") -@click.option('--use_cpp', - default=True, - type=bool, - help=("Whether or not to use C++ node and message types or to prefer python. " - "Only use as a last resort if bugs are encountered")) @click.version_option() @click.pass_context -def cli(ctx: click.Context, log_level: int, use_cpp: bool): +def cli(ctx: click.Context, log_level: int): """Main entrypoint for the LLM Examples""" - from morpheus.config import CppConfig from morpheus.utils.logger import configure_logging ctx_dict = ctx.ensure_object(dict) - CppConfig.set_should_use_cpp(use_cpp) - # Configure the logging configure_logging(log_level=log_level) diff --git a/examples/llm/completion/README.md b/examples/llm/completion/README.md index e72ffe1ce6..562e1a1020 100644 --- a/examples/llm/completion/README.md +++ b/examples/llm/completion/README.md @@ -78,7 +78,7 @@ Before running the pipeline, ensure that the `NGC_API_KEY` environment variable Install the required dependencies. ```bash -mamba env update \ +conda env update --solver=libmamba \ -n ${CONDA_DEFAULT_ENV} \ --file ./conda/environments/examples_cuda-125_arch-x86_64.yaml ``` @@ -114,6 +114,9 @@ python examples/llm/main.py completion [OPTIONS] COMMAND [ARGS]... 
- `pipeline` ##### Options: +- `--use_cpu_only` + - **Description**: Run in CPU only mode + - **Default**: `False` - `--num_threads INTEGER RANGE` - **Description**: Number of internal pipeline threads to use. diff --git a/examples/llm/completion/pipeline.py b/examples/llm/completion/pipeline.py index 86b5df19d7..4087ce9ca6 100644 --- a/examples/llm/completion/pipeline.py +++ b/examples/llm/completion/pipeline.py @@ -15,9 +15,8 @@ import logging import time -import cudf - from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.config import PipelineModes from morpheus.io.deserializers import read_file_to_df from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -26,6 +25,8 @@ from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage from morpheus.utils.concat_df import concat_dataframes +from morpheus.utils.type_utils import exec_mode_to_df_type_str +from morpheus.utils.type_utils import get_df_class from morpheus_llm.llm import LLMEngine from morpheus_llm.llm.nodes.extracter_node import ExtracterNode from morpheus_llm.llm.nodes.llm_generate_node import LLMGenerateNode @@ -71,7 +72,8 @@ def _build_engine(llm_service: str): return engine -def pipeline(num_threads: int, +def pipeline(use_cpu_only: bool, + num_threads: int, pipeline_batch_size: int, model_max_batch_size: int, repeat_count: int, @@ -80,6 +82,7 @@ def pipeline(num_threads: int, shuffle: bool = False) -> float: config = Config() + config.execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU # Below properties are specified by the command line config.num_threads = num_threads @@ -89,9 +92,10 @@ def pipeline(num_threads: int, config.edge_buffer_size = 128 if input_file is not None: - source_df = read_file_to_df(input_file, df_type='cudf') + source_df = read_file_to_df(input_file, df_type=exec_mode_to_df_type_str(config.execution_mode)) else: - source_df 
= cudf.DataFrame({ + df_class = get_df_class(config.execution_mode) + source_df = df_class({ "country": [ "France", "Spain", diff --git a/examples/llm/completion/run.py b/examples/llm/completion/run.py index 611a5105db..ed2e8a6c3d 100644 --- a/examples/llm/completion/run.py +++ b/examples/llm/completion/run.py @@ -26,6 +26,7 @@ def run(): @run.command() +@click.option('--use_cpu_only', default=False, type=bool, is_flag=True, help="Run in CPU only mode") @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), diff --git a/examples/llm/vdb_upload/run.py b/examples/llm/vdb_upload/run.py index f02ed5dfe0..b3099d845f 100644 --- a/examples/llm/vdb_upload/run.py +++ b/examples/llm/vdb_upload/run.py @@ -115,7 +115,7 @@ def run(): @click.option( "--triton_server_url", type=str, - default="localhost:8001", + default="localhost:8000", help="Triton server URL.", ) @click.option( diff --git a/examples/log_parsing/inference.py b/examples/log_parsing/inference.py index 67f4062409..099928cff9 100644 --- a/examples/log_parsing/inference.py +++ b/examples/log_parsing/inference.py @@ -19,7 +19,6 @@ import tritonclient.grpc as tritonclient from scipy.special import softmax -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes @@ -62,7 +61,7 @@ def build_output_message(self, msg: ControlMessage) -> ControlMessage: seq_ids[:, 0] = cp.arange(0, msg.tensors().count, dtype=cp.uint32) seq_ids[:, 2] = msg.tensors().get_tensor('seq_ids')[:, 2] - memory = _messages.TensorMemory( + memory = TensorMemory( count=msg.tensors().count, tensors={ 'confidences': cp.zeros((msg.tensors().count, self._inputs[list(self._inputs.keys())[0]].shape[1])), diff --git a/examples/ransomware_detection/README.md b/examples/ransomware_detection/README.md index 0388140227..4b15a30b71 100644 --- a/examples/ransomware_detection/README.md +++ b/examples/ransomware_detection/README.md @@ 
-68,7 +68,7 @@ Once Triton server finishes starting up, it will display the status of all loade Run the following from the root of the Morpheus repo to start the ransomware detection pipeline: ```bash -python examples/ransomware_detection/run.py --server_url=localhost:8001 \ +python examples/ransomware_detection/run.py --server_url=localhost:8000 \ --sliding_window=3 \ --model_name=ransomw-model-short-rf \ --input_glob=./examples/data/appshield/*/snapshot-*/*.json \ @@ -88,7 +88,6 @@ Usage: run.py [OPTIONS] Options: --debug BOOLEAN - --use_cpp BOOLEAN --num_threads INTEGER RANGE Number of internal pipeline threads to use [x>=1] --n_dask_workers INTEGER RANGE Number of dask workers [x>=2] diff --git a/examples/ransomware_detection/run.py b/examples/ransomware_detection/run.py index a89c7c93f2..a94fe301f6 100644 --- a/examples/ransomware_detection/run.py +++ b/examples/ransomware_detection/run.py @@ -20,7 +20,6 @@ import yaml from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.general.monitor_stage import MonitorStage @@ -39,7 +38,6 @@ @click.command() @click.option('--debug', default=False) -@click.option('--use_cpp', default=False, help="Enable C++ execution for this pipeline, currently this is unsupported.") @click.option( "--num_threads", default=len(os.sched_getaffinity(0)), @@ -102,7 +100,6 @@ help="The path to the file where the inference output will be saved.", ) def run_pipeline(debug, - use_cpp, num_threads, n_dask_workers, threads_per_dask_worker, @@ -122,8 +119,6 @@ def run_pipeline(debug, snapshot_fea_length = 99 - CppConfig.set_should_use_cpp(use_cpp) - # Its necessary to get the global config object and configure it for FIL mode. 
config = Config() config.mode = PipelineModes.FIL @@ -205,6 +200,7 @@ def run_pipeline(debug, model_name=model_name, server_url=server_url, force_convert_inputs=True, + thread_count=1 # Work-around for issue #1891, remove once resolved. )) # Add a monitor stage. diff --git a/examples/ransomware_detection/stages/create_features.py b/examples/ransomware_detection/stages/create_features.py index 862747d9a6..3ca214caad 100644 --- a/examples/ransomware_detection/stages/create_features.py +++ b/examples/ransomware_detection/stages/create_features.py @@ -12,38 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. -import typing - import mrc +import pandas as pd from mrc.core import operators as ops from dask.distributed import Client +import cudf + from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.control_message_stage import ControlMessageStage -from morpheus.stages.input.appshield_source_stage import AppShieldMessageMeta +from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from common.data_models import FeatureConfig # pylint: disable=no-name-in-module # isort: skip from common.feature_extractor import FeatureExtractor # pylint: disable=no-name-in-module # isort: skip @register_stage("create-features", modes=[PipelineModes.FIL]) -class CreateFeaturesRWStage(ControlMessageStage): +class CreateFeaturesRWStage(PreallocatorMixin, ControlMessageStage): """ - This class extends ControlMessageStage to deal with scenario specific features from Appshiled plugins data. + Stage creates features from Appshield plugins data.
Parameters ---------- c : morpheus.config.Config Pipeline configuration instance - interested_plugins : typing.List[str] + interested_plugins : list[str] Only intrested plugins files will be read from Appshield snapshots - feature_columns : typing.List[str] + feature_columns : list[str] List of features needed to be extracted. - file_extns : typing.List[str] + file_extns : list[str] File extensions. n_workers: int, default = 2 Number of dask workers. @@ -54,9 +56,9 @@ class CreateFeaturesRWStage(ControlMessageStage): def __init__( self, c: Config, - interested_plugins: typing.List[str], - feature_columns: typing.List[str], - file_extns: typing.List[str], + interested_plugins: list[str], + feature_columns: list[str], + file_extns: list[str], n_workers: int = 2, threads_per_worker: int = 2, ): @@ -73,20 +75,23 @@ def __init__( def name(self) -> str: return "create-features-rw" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Returns accepted input types for this stage. """ - return (AppShieldMessageMeta, ) + return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def on_next(self, x: AppShieldMessageMeta): + def on_next(self, msg: ControlMessage) -> list[ControlMessage]: snapshot_fea_dfs = [] - df = x.df + with msg.payload().mutable_dataframe() as cdf: + df = cdf.to_pandas() + + msg_source = msg.get_metadata("source") # Type cast CommitCharge. df["CommitCharge"] = df["CommitCharge"].astype("float").astype("Int32") @@ -118,25 +123,23 @@ def on_next(self, x: AppShieldMessageMeta): # Snapshot sequence will be generated using `source_pid_process`. # Determines which source generated the snapshot messages. 
# There's a chance of receiving the same snapshots names from multiple sources(hosts) - features_df['source_pid_process'] = x.source + '_' + features_df.pid_process + features_df['source_pid_process'] = msg_source + '_' + features_df.pid_process + + # Cast int values to string preventing the df from converting to cuDF. + features_df['ldrmodules_df_path'] = features_df['ldrmodules_df_path'].astype(str) # Sort entries by pid_process and snapshot_id features_df = features_df.sort_values(by=["pid_process", "snapshot_id"]).reset_index(drop=True) - # Create AppShieldMessageMeta with extracted features information. - meta = AppShieldMessageMeta(features_df, x.source) + return self.split_messages(msg_source, features_df) - return meta + def split_messages(self, msg_source: str, df: pd.DataFrame) -> list[ControlMessage]: - def create_control_messages(self, app_shield_message_meta: AppShieldMessageMeta) -> typing.List[ControlMessage]: - - control_messages = [] - - df = app_shield_message_meta.df + output_messages = [] pid_processes = df.pid_process.unique() - # Create multi messaage per pid_process, this assumes that the DF has been sorted by the `pid_process` column + # Create a unique message per pid_process, this assumes the DF has been sorted by the `pid_process` column for pid_process in pid_processes: pid_process_index = df[df.pid_process == pid_process].index
Close dask client when pipeline initiates shutdown @@ -159,7 +164,6 @@ def on_completed(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = builder.make_node(self.unique_name, ops.map(self.on_next), - ops.map(self.create_control_messages), ops.on_completed(self.on_completed), ops.flatten()) builder.make_edge(input_node, node) diff --git a/examples/ransomware_detection/stages/preprocessing.py b/examples/ransomware_detection/stages/preprocessing.py index 3715f92425..68f6c8bc85 100644 --- a/examples/ransomware_detection/stages/preprocessing.py +++ b/examples/ransomware_detection/stages/preprocessing.py @@ -18,12 +18,12 @@ import mrc import pandas as pd -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryFIL from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage from common.data_models import SnapshotData # pylint: disable=no-name-in-module #isort:skip @@ -39,13 +39,13 @@ class PreprocessingRWStage(PreprocessBaseStage): ---------- c : morpheus.config.Config Pipeline configuration instance - feature_columns : typing.List[str] + feature_columns : list[str] List of features needed to be extracted. sliding_window: int, default = 3 Window size to arrange the sanpshots in seequential order. """ - def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: int = 3): + def __init__(self, c: Config, feature_columns: list[str], sliding_window: int = 3): super().__init__(c) @@ -54,7 +54,7 @@ def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: self._features_len = len(self._feature_columns) # Stateful member to hold unprocessed snapshots. 
- self._snapshot_dict: typing.Dict[str, typing.List[SnapshotData]] = {} + self._snapshot_dict: dict[str, list[SnapshotData]] = {} # Padding data to map inference response with input messages. self._padding_data = [0 for i in range(self._features_len * sliding_window)] @@ -64,11 +64,10 @@ def __init__(self, c: Config, feature_columns: typing.List[str], sliding_window: def name(self) -> str: return "preprocess-rw" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def _sliding_window_offsets(self, ids: typing.List[int], ids_len: int, - window: int) -> typing.List[typing.Tuple[int]]: + def _sliding_window_offsets(self, ids: list[int], ids_len: int, window: int) -> list[tuple[int]]: """ Create snapshot_id's sliding sequence for a given window """ @@ -86,10 +85,7 @@ def _sliding_window_offsets(self, ids: typing.List[int], ids_len: int, return sliding_window_offsets - def _rollover_pending_snapshots(self, - snapshot_ids: typing.List[int], - source_pid_process: str, - snapshot_df: pd.DataFrame): + def _rollover_pending_snapshots(self, snapshot_ids: list[int], source_pid_process: str, snapshot_df: pd.DataFrame): """ Store the unprocessed snapshots from current run to a stateful member to process them in the next run. """ @@ -130,7 +126,9 @@ def _pre_process_batch(self, msg: ControlMessage) -> ControlMessage: Current run's unprocessed snapshots will be rolled over to the next. """ - snapshot_df = msg.payload().df + meta = msg.payload() + snapshot_df = meta.copy_dataframe().to_pandas() + curr_snapshots_size = len(snapshot_df) # Set snapshot_id as index this is used to get ordered snapshots based on sliding window. 
@@ -174,19 +172,18 @@ def _pre_process_batch(self, msg: ControlMessage) -> ControlMessage: self._rollover_pending_snapshots(snapshot_ids, source_pid_process, snapshot_df) # This column is used to identify whether sequence is genuine or dummy - msg.payload().set_data('sequence', sequence) + meta.set_data('sequence', sequence) # Convert data to cupy array data = cp.asarray(data) - seg_ids = cp.zeros((curr_snapshots_size, 3), dtype=cp.uint32) - seg_ids[:, 0] = cp.arange(0, curr_snapshots_size, dtype=cp.uint32) - seg_ids[:, 2] = self._features_len * 3 + seq_ids = cp.zeros((curr_snapshots_size, 3), dtype=cp.uint32) + seq_ids[:, 0] = cp.arange(0, curr_snapshots_size, dtype=cp.uint32) + seq_ids[:, 2] = self._features_len * 3 - memory = _messages.InferenceMemoryFIL(count=curr_snapshots_size, input__0=data, seq_ids=seg_ids) - msg.tensors(memory) + memory = InferenceMemoryFIL(count=curr_snapshots_size, input__0=data, seq_ids=seq_ids) msg.set_metadata("inference_memory_params", {"inference_type": "fil"}) - + msg.tensors(memory) return msg def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: diff --git a/examples/root_cause_analysis/README.md b/examples/root_cause_analysis/README.md index 5d038fa959..943c00fad2 100644 --- a/examples/root_cause_analysis/README.md +++ b/examples/root_cause_analysis/README.md @@ -105,8 +105,8 @@ From the Morpheus repo root directory, run: ```bash export MORPHEUS_ROOT=$(pwd) morpheus --log_level=DEBUG \ -`# Run a pipeline with 5 threads and a model batch size of 32 (Must match Triton config)` \ -run --num_threads=8 --edge_buffer_size=4 --use_cpp=True --pipeline_batch_size=1024 --model_max_batch_size=32 \ +`# Run a pipeline with 8 threads and a model batch size of 32 (Must match Triton config)` \ +run --num_threads=8 --edge_buffer_size=4 --pipeline_batch_size=1024 --model_max_batch_size=32 \ `# Specify a NLP pipeline with 128 sequence length (Must match Triton config)` \ pipeline-nlp --model_seq_length=128 
--label=not_root_cause --label=is_root_cause \ `# 1st Stage: Read from file` \ diff --git a/examples/sid_visualization/README.md b/examples/sid_visualization/README.md index 10aeb4cbee..c1d88b25b4 100644 --- a/examples/sid_visualization/README.md +++ b/examples/sid_visualization/README.md @@ -96,7 +96,7 @@ After the GUI has been launched, Morpheus now needs to be started. In the same s ```bash python examples/sid_visualization/run.py \ --debug \ - --triton_server_url=triton:8001 \ + --triton_server_url=triton:8000 \ --input_file=./examples/data/sid_visualization/group1-benign-2nodes.jsonlines \ --input_file=./examples/data/sid_visualization/group2-benign-50nodes.jsonlines \ --input_file=./examples/data/sid_visualization/group3-si-50nodes.jsonlines \ @@ -147,7 +147,7 @@ morpheus --log_level=DEBUG \ pipeline-nlp --model_seq_length=256 \ from-file --filename=${DEMO_DATASET} \ deserialize \ - preprocess --vocab_hash_file=morpheus/data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ + preprocess --vocab_hash_file=data/bert-base-uncased-hash.txt --truncation=True --do_lower_case=True --add_special_tokens=False \ inf-triton --model_name=sid-minibert-onnx --server_url=triton:8001 --force_convert_inputs=True \ monitor --description Inference\ Rate --unit=inf \ add-class \ diff --git a/examples/sid_visualization/run.py b/examples/sid_visualization/run.py index 4db84fac11..6e58a92791 100644 --- a/examples/sid_visualization/run.py +++ b/examples/sid_visualization/run.py @@ -21,7 +21,6 @@ from morpheus.common import FileTypes from morpheus.config import Config -from morpheus.config import CppConfig from morpheus.config import PipelineModes from morpheus.io.deserializers import read_file_to_df from morpheus.messages import MessageMeta @@ -120,7 +119,6 @@ def _generate_frames(self): @click.command() @click.option("--debug/--no-debug", default=False) -@click.option('--use_cpp', default=True) @click.option( "--num_threads", 
default=len(os.sched_getaffinity(0)), @@ -148,16 +146,14 @@ def _generate_frames(self): default="sid-minibert-onnx", help="The name of the model that is deployed on Tritonserver.", ) -@click.option("--triton_server_url", default="localhost:8001", required=True, help="Tritonserver url.") -def run_pipeline(debug, use_cpp, num_threads, input_file, max_batch_size, model_name, triton_server_url): +@click.option("--triton_server_url", default="localhost:8000", required=True, help="Tritonserver url.") +def run_pipeline(debug, num_threads, input_file, max_batch_size, model_name, triton_server_url): if debug: configure_logging(log_level=logging.DEBUG) else: configure_logging(log_level=logging.INFO) - CppConfig.set_should_use_cpp(use_cpp) - # Its necessary to get the global config object and configure it for FIL mode. config = Config() config.mode = PipelineModes.NLP diff --git a/external/utilities b/external/utilities index 722c1352a0..85f8f7af2e 160000 --- a/external/utilities +++ b/external/utilities @@ -1 +1 @@ -Subproject commit 722c1352a0e9b9f606d343714cee88578c12e455 +Subproject commit 85f8f7af2e8d9bc7bde978cd40c40297b1116957 diff --git a/models/model-cards/dfp-model-card.md b/models/model-cards/dfp-model-card.md index 71c0eebc04..88b453d254 100644 --- a/models/model-cards/dfp-model-card.md +++ b/models/model-cards/dfp-model-card.md @@ -52,7 +52,7 @@ The model architecture consists of an Autoencoder, where the reconstruction loss * Reconstruction loss (per feature) **Output Parameters:** -* Pandas DataFrame +* pandas DataFrame ## Software Integration: **Runtime:** diff --git a/morpheus.code-workspace b/morpheus.code-workspace index 24f34efff7..0b3edb2aee 100644 --- a/morpheus.code-workspace +++ b/morpheus.code-workspace @@ -242,6 +242,18 @@ "request": "launch", "type": "debugpy" }, + { + "args": [ + "-x" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": false, + "module": "pytest", + "name": "Python: tests", + "request": "launch", 
+ "type": "debugpy" + }, { "MIMode": "gdb", "args": [ diff --git a/pyproject.toml b/pyproject.toml index 678c060041..d50c73630f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,8 +15,9 @@ markers = [ "slow: Slow tests", "kafka: Tests that require a running instance of kafka", "milvus: Tests that require a running instance of milvus", - "use_cpp: Test support C++ nodes and objects", - "use_python: Test only supports Python nodes and objects", + "gpu_mode: Test support GPU nodes and objects", + "cpu_mode: Test only supports CPU nodes and objects", + "gpu_and_cpu_mode: Test supports both GPU and CPU nodes and objects", "use_cudf: Test supports cuDF datasets", "use_pandas: Test supports Pandas datasets", "replace_callback: Replaces the results_callback in cli", diff --git a/python/morpheus/morpheus/_lib/common/module.cpp b/python/morpheus/morpheus/_lib/common/module.cpp index c36d349bac..f9ba779f10 100644 --- a/python/morpheus/morpheus/_lib/common/module.cpp +++ b/python/morpheus/morpheus/_lib/common/module.cpp @@ -28,7 +28,6 @@ #include "morpheus/objects/filter_source.hpp" #include "morpheus/objects/tensor_object.hpp" // for TensorObject #include "morpheus/objects/wrapped_tensor.hpp" -#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/http_server.hpp" #include "morpheus/version.hpp" @@ -58,9 +57,6 @@ PYBIND11_MODULE(common, _module) :toctree: _generate )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - LoaderRegistry::register_factory_fn( "file", [](nlohmann::json config) { diff --git a/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp b/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp index 22e668cfe3..c10bdc4a78 100644 --- a/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp +++ b/python/morpheus/morpheus/_lib/include/morpheus/messages/control.hpp @@ -24,13 +24,16 @@ #include // for object, dict, list #include // IWYU pragma: keep -#include // for system_clock, time_point +// 
for system_clock, time_point +#include // IWYU pragma: keep #include // for map #include // for shared_ptr #include // for optional #include // for string #include // for vector +// IWYU pragma: no_include + namespace morpheus { enum class MORPHEUS_EXPORT ControlMessageType @@ -40,128 +43,6 @@ enum class MORPHEUS_EXPORT ControlMessageType TRAINING }; -// class PayloadManager -// { -// public: -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// TensorObject& get_tensor(const std::string& name) -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return const TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// const TensorObject& get_tensor(const std::string& name) const -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Set the tensor object identified by `name` -// * -// * @param name -// * @param tensor -// * @throws std::length_error If the number of rows in `tensor` does not match `count`. -// */ -// void set_tensor(const std::string& name, TensorObject&& tensor) -// { -// m_tensors->set_tensor(name, std::move(tensor)); -// } - -// /** -// * @brief Get a reference to the internal tensors map -// * -// * @return const TensorMap& -// */ -// const TensorMap& get_tensors() const -// { -// return m_tensors->get_tensors(); -// } - -// /** -// * @brief Set the tensors object -// * -// * @param tensors -// * @throws std::length_error If the number of rows in the `tensors` do not match `count`. 
-// */ -// void set_tensors(TensorMap&& tensors) -// { -// m_tensors->set_tensors(std::move(tensors)); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// TensorObject& get_column(const std::string& name) -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Get the tensor object identified by `name` -// * -// * @param name -// * @return const TensorObject& -// * @throws std::runtime_error If no tensor matching `name` exists -// */ -// const TensorObject& get_column(const std::string& name) const -// { -// return m_tensors->get_tensor(name); -// } - -// /** -// * @brief Set the tensor object identified by `name` -// * -// * @param name -// * @param tensor -// * @throws std::length_error If the number of rows in `tensor` does not match `count`. -// */ -// void set_column(const std::string& name, TensorObject&& tensor) -// { -// m_tensors->set_tensor(name, std::move(tensor)); -// } - -// /** -// * @brief Get a reference to the internal tensors map -// * -// * @return const TensorMap& -// */ -// TableInfo get_columns() const -// { -// return m_df->get_info(); -// } - -// /** -// * @brief Set the tensors object -// * -// * @param tensors -// * @throws std::length_error If the number of rows in the `tensors` do not match `count`. 
-// */ -// void set_columns(TableInfo&& tensors) -// { -// m_tensors->set_tensors(std::move(tensors)); -// } - -// private: -// std::shared_ptr m_df; -// std::shared_ptr m_tensors; -// }; - class MORPHEUS_EXPORT TensorMemory; // System-clock for better compatibility with pybind11/chrono @@ -369,6 +250,8 @@ class MORPHEUS_EXPORT ControlMessage static const std::string s_config_schema; // NOLINT static std::map s_task_type_map; // NOLINT + ControlMessageType to_task_type(const std::string& task_type, bool throw_on_error) const; + ControlMessageType m_cm_type{ControlMessageType::NONE}; std::shared_ptr m_payload{nullptr}; std::shared_ptr m_tensors{nullptr}; @@ -382,11 +265,13 @@ class MORPHEUS_EXPORT ControlMessage struct MORPHEUS_EXPORT ControlMessageProxy { /** - * @brief Creates a new ControlMessage instance from a configuration dictionary. - * @param config A pybind11::dict representing the configuration for the ControlMessage. + * @brief Creates a new ControlMessage instance from either a Python instance of a ControlMessage or a configuration + * dictionary. + * @param config_or_message Either a Python instance of a ControlMessage or a dict representing the configuration + * for the ControlMessage. * @return A shared_ptr to a newly created ControlMessage instance. */ - static std::shared_ptr create(pybind11::dict& config); + static std::shared_ptr create(pybind11::object& config_or_message); /** * @brief Creates a new ControlMessage instance as a copy of an existing one. 
diff --git a/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp b/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp index 5eb6636919..7a87620b90 100644 --- a/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp +++ b/python/morpheus/morpheus/_lib/include/morpheus/utilities/cudf_util.hpp @@ -40,7 +40,7 @@ namespace morpheus { struct CudfHelper { public: - __attribute__((visibility("default"))) static void load(); + static void load(); /** * @brief Converts a C++ table to a Python DataTable object @@ -67,6 +67,9 @@ struct CudfHelper * @return TableInfoData */ static TableInfoData table_info_data_from_table(pybind11::object table); + + private: + CudfHelper(); }; /** @} */ // end of group diff --git a/python/morpheus/morpheus/_lib/messages/__init__.pyi b/python/morpheus/morpheus/_lib/messages/__init__.pyi index 74d7a522d4..4974b93daf 100644 --- a/python/morpheus/morpheus/_lib/messages/__init__.pyi +++ b/python/morpheus/morpheus/_lib/messages/__init__.pyi @@ -36,7 +36,7 @@ class ControlMessage(): @typing.overload def __init__(self, arg0: ControlMessage) -> None: ... @typing.overload - def __init__(self, arg0: dict) -> None: ... + def __init__(self, arg0: object) -> None: ... def add_task(self, task_type: str, task: object | None) -> None: ... @typing.overload def config(self) -> object | None: ... 
diff --git a/python/morpheus/morpheus/_lib/messages/module.cpp b/python/morpheus/morpheus/_lib/messages/module.cpp index fdc5fce73b..af559bde58 100644 --- a/python/morpheus/morpheus/_lib/messages/module.cpp +++ b/python/morpheus/morpheus/_lib/messages/module.cpp @@ -29,8 +29,7 @@ #include "morpheus/messages/raw_packet.hpp" #include "morpheus/objects/data_table.hpp" #include "morpheus/objects/mutable_table_ctx_mgr.hpp" -#include "morpheus/pybind11/json.hpp" // IWYU pragma: keep -#include "morpheus/utilities/cudf_util.hpp" +#include "morpheus/pybind11/json.hpp" // IWYU pragma: keep #include "morpheus/utilities/json_types.hpp" // for json_t #include "morpheus/utilities/string_util.hpp" #include "morpheus/version.hpp" @@ -127,10 +126,7 @@ PYBIND11_MODULE(messages, _module) )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - - mrc::pymrc::import(_module, "cupy"); + mrc::pymrc::import(_module, "cupy"); // It should be safe to import cupy in CPU only mode mrc::pymrc::import(_module, "morpheus._lib.common"); // Required for SegmentObject @@ -256,8 +252,8 @@ PYBIND11_MODULE(messages, _module) py::class_>(_module, "ControlMessage") .def(py::init<>()) - .def(py::init(py::overload_cast(&ControlMessageProxy::create))) .def(py::init(py::overload_cast>(&ControlMessageProxy::create))) + .def(py::init(py::overload_cast(&ControlMessageProxy::create))) .def("add_task", &ControlMessage::add_task, py::arg("task_type"), py::arg("task")) .def( "config", py::overload_cast(&ControlMessage::config), py::arg("config")) diff --git a/python/morpheus/morpheus/_lib/src/messages/control.cpp b/python/morpheus/morpheus/_lib/src/messages/control.cpp index ca23c5f9f8..d20334c35a 100644 --- a/python/morpheus/morpheus/_lib/src/messages/control.cpp +++ b/python/morpheus/morpheus/_lib/src/messages/control.cpp @@ -17,20 +17,25 @@ #include "morpheus/messages/control.hpp" -#include "morpheus/messages/meta.hpp" // for MessageMeta, MessageMetaInterfaceProxy - -#include // for COMPACT_GOOGLE_LOG_INFO, 
LogMessage, VLOG -#include // for basic_json, json_ref, iter_impl, operator<< -#include // IWYU pragma: keep -#include // for cast, object::cast -#include // for object, none, dict, isinstance, list, str, value_error, generic_item -#include // for cast_from_pyobject +#include "morpheus/messages/memory/tensor_memory.hpp" // for TensorMemory, TensorMemoryInterfaceProxy +#include "morpheus/messages/meta.hpp" // for MessageMeta, MessageMetaInterfaceProxy +#include "morpheus/types.hpp" // for TensorIndex + +#include // for to_lower_copy +#include // for COMPACT_GOOGLE_LOG_INFO, LogMessage, VLOG +#include // for basic_json, json_ref, iter_impl, operator<< +#include // IWYU pragma: keep +#include // for cast, object::cast +#include // for object, none, dict, isinstance, list, str, value_error, generic_item +#include // IWYU pragma: keep +#include // for cast_from_pyobject #include // for optional, nullopt #include // for basic_ostream, operator<< #include // for regex_search, regex #include // for runtime_error #include // for pair +// IWYU pragma: no_include namespace py = pybind11; using namespace py::literals; @@ -40,6 +45,7 @@ namespace morpheus { const std::string ControlMessage::s_config_schema = R"()"; std::map ControlMessage::s_task_type_map{{"inference", ControlMessageType::INFERENCE}, + {"none", ControlMessageType::NONE}, {"training", ControlMessageType::TRAINING}}; ControlMessage::ControlMessage() : m_config({{"metadata", morpheus::utilities::json_t::object()}}), m_tasks({}) {} @@ -53,8 +59,14 @@ ControlMessage::ControlMessage(const morpheus::utilities::json_t& _config) : ControlMessage::ControlMessage(const ControlMessage& other) { + m_cm_type = other.m_cm_type; + m_payload = other.m_payload; + m_tensors = other.m_tensors; + m_config = other.m_config; m_tasks = other.m_tasks; + + m_timestamps = other.m_timestamps; } const morpheus::utilities::json_t& ControlMessage::config() const @@ -65,16 +77,19 @@ const morpheus::utilities::json_t& ControlMessage::config() 
const void ControlMessage::add_task(const std::string& task_type, const morpheus::utilities::json_t& task) { VLOG(20) << "Adding task of type " << task_type << " to control message" << task.dump(4); - auto _task_type = s_task_type_map.contains(task_type) ? s_task_type_map[task_type] : ControlMessageType::NONE; - - if (this->task_type() == ControlMessageType::NONE) - { - this->task_type(_task_type); - } + auto _task_type = to_task_type(task_type, false); - if (_task_type != ControlMessageType::NONE and this->task_type() != _task_type) + if (_task_type != ControlMessageType::NONE) { - throw std::runtime_error("Cannot add inference and training tasks to the same control message"); + auto current_task_type = this->task_type(); + if (current_task_type == ControlMessageType::NONE) + { + this->task_type(_task_type); + } + else if (current_task_type != _task_type) + { + throw std::runtime_error("Cannot mix different types of tasks on the same control message"); + } } m_tasks[task_type].push_back(task); @@ -197,14 +212,7 @@ void ControlMessage::config(const morpheus::utilities::json_t& config) { if (config.contains("type")) { - auto task_type = config.at("type"); - auto _task_type = - s_task_type_map.contains(task_type) ? 
s_task_type_map.at(task_type) : ControlMessageType::NONE; - - if (this->task_type() == ControlMessageType::NONE) - { - this->task_type(_task_type); - } + this->task_type(to_task_type(config.at("type").get(), true)); } if (config.contains("tasks")) @@ -256,10 +264,65 @@ void ControlMessage::task_type(ControlMessageType type) m_cm_type = type; } +ControlMessageType ControlMessage::to_task_type(const std::string& task_type, bool throw_on_error) const +{ + auto lower_task_type = boost::to_lower_copy(task_type); + if (ControlMessage::s_task_type_map.contains(lower_task_type)) + { + return ControlMessage::s_task_type_map.at(lower_task_type); + } + + if (throw_on_error) + { + throw std::runtime_error("Invalid task type: " + task_type); + } + + return ControlMessageType::NONE; +} + /*** Proxy Implementations ***/ -std::shared_ptr ControlMessageProxy::create(py::dict& config) +std::shared_ptr ControlMessageProxy::create(py::object& config_or_message) { - return std::make_shared(mrc::pymrc::cast_from_pyobject(config)); + if (config_or_message.is_none()) + { + return std::make_shared(); + } + + if (py::isinstance(config_or_message)) + { + return std::make_shared(mrc::pymrc::cast_from_pyobject(config_or_message)); + } + + // Assume we received an instance of the Python impl of ControlMessage object, as a Python bound instance of the C++ + // impl of the ControlMessage class would have invoked the shared_ptr overload of the create method + py::dict config = config_or_message.attr("_export_config")(); + auto cm = std::make_shared(mrc::pymrc::cast_from_pyobject(config)); + + auto py_meta = config_or_message.attr("payload")(); + if (!py_meta.is_none()) + { + cm->payload(MessageMetaInterfaceProxy::init_python_meta(py_meta)); + } + + auto py_tensors = config_or_message.attr("tensors")(); + if (!py_tensors.is_none()) + { + auto count = py_tensors.attr("count").cast(); + auto py_tensors_map = py_tensors.attr("get_tensors")(); + cm->tensors(TensorMemoryInterfaceProxy::init(count, 
py_tensors_map)); + } + + auto py_timestamps = config_or_message.attr("_timestamps"); + if (!py_timestamps.is_none()) + { + auto timestamps_map = py_timestamps.cast>(); + for (const auto& t : timestamps_map) + { + cm->set_timestamp(t.first, t.second); + } + } + + return cm; } std::shared_ptr ControlMessageProxy::create(std::shared_ptr other) diff --git a/python/morpheus/morpheus/_lib/src/messages/meta.cpp b/python/morpheus/morpheus/_lib/src/messages/meta.cpp index 7426bf7a5d..8b37633612 100644 --- a/python/morpheus/morpheus/_lib/src/messages/meta.cpp +++ b/python/morpheus/morpheus/_lib/src/messages/meta.cpp @@ -252,7 +252,7 @@ std::shared_ptr MessageMetaInterfaceProxy::init_python(py::object&& auto cudf_df_cls = py::module_::import("cudf").attr("DataFrame"); if (!py::isinstance(data_frame, cudf_df_cls)) { - // Convert to cudf if it's a Pandas DF, thrown an error otherwise + // Check if we received a Pandas DF or the Python impl of MessageMeta, throw an error otherwise auto pd_df_cls = py::module_::import("pandas").attr("DataFrame"); if (py::isinstance(data_frame, pd_df_cls)) { @@ -265,6 +265,7 @@ std::shared_ptr MessageMetaInterfaceProxy::init_python(py::object&& auto msg_meta_cls = py::module_::import("morpheus.messages").attr("MessageMeta"); if (py::isinstance(data_frame, msg_meta_cls)) { + DVLOG(10) << "Converting from a Python impl of MessageMeta to C++ impl"; return init_python_meta(data_frame); } else diff --git a/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp b/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp index fbc86ad0d2..2e1c98a84d 100644 --- a/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp +++ b/python/morpheus/morpheus/_lib/src/utilities/cudf_util.cpp @@ -38,7 +38,7 @@ namespace morpheus { -void CudfHelper::load() +CudfHelper::CudfHelper() { // Avoid loading cudf_helpers if we are in a sphinx build if (std::getenv("MORPHEUS_IN_SPHINX_BUILD") == nullptr) @@ -53,14 +53,21 @@ void CudfHelper::load() } } +void 
CudfHelper::load() +{ + static CudfHelper s; +} + pybind11::object proxy_table_from_table_with_metadata(cudf::io::table_with_metadata&& table, int index_col_count) { + CudfHelper::load(); return pybind11::reinterpret_steal( (PyObject*)make_table_from_table_with_metadata(std::move(table), index_col_count)); } morpheus::TableInfoData proxy_table_info_data_from_table(pybind11::object table) { + CudfHelper::load(); return make_table_info_data_from_table(table.ptr()); } @@ -71,6 +78,7 @@ pybind11::object CudfHelper::table_from_table_with_metadata(cudf::io::table_with pybind11::object CudfHelper::table_from_table_info(const TableInfoBase& table_info) { + CudfHelper::load(); // Get the table info data from the table_into auto table_info_data = table_info.get_data(); diff --git a/python/morpheus/morpheus/_lib/stages/module.cpp b/python/morpheus/morpheus/_lib/stages/module.cpp index 51add3410e..266455177e 100644 --- a/python/morpheus/morpheus/_lib/stages/module.cpp +++ b/python/morpheus/morpheus/_lib/stages/module.cpp @@ -31,7 +31,6 @@ #include "morpheus/stages/preprocess_nlp.hpp" // for PreprocessNLPStage, PreprocessNLPStageInterfaceProxy #include "morpheus/stages/serialize.hpp" // for SerializeStage, SerializeStageInterfaceProxy #include "morpheus/stages/write_to_file.hpp" // for WriteToFileStage, WriteToFileStageInterfaceProxy -#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper #include "morpheus/utilities/http_server.hpp" // for DefaultMaxPayloadSize #include "morpheus/version.hpp" // for morpheus_VERSION_MAJOR, morpheus_VERSION_MINOR, morp... 
@@ -64,9 +63,6 @@ PYBIND11_MODULE(stages, _module) )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - // Make sure to load mrc.core.segment to get ObjectProperties mrc::pymrc::import(_module, "mrc.core.segment"); diff --git a/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp b/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp index 780ad48b37..94fd26aae3 100644 --- a/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp +++ b/python/morpheus/morpheus/_lib/tests/messages/test_dev_doc_ex3.cpp @@ -17,10 +17,9 @@ #include "../test_utils/common.hpp" // IWYU pragma: associated -#include "morpheus/messages/control.hpp" // for ControlMessage -#include "morpheus/messages/meta.hpp" // for MessageMeta -#include "morpheus/objects/table_info.hpp" // for MutableTableInfo -#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper +#include "morpheus/messages/control.hpp" // for ControlMessage +#include "morpheus/messages/meta.hpp" // for MessageMeta +#include "morpheus/objects/table_info.hpp" // for MutableTableInfo #include #include // for gil_scoped_release, gil_scoped_acquire @@ -34,20 +33,7 @@ using namespace morpheus; using namespace morpheus::test; class TestDevDocEx3 : public morpheus::test::TestWithPythonInterpreter -{ - protected: - void SetUp() override - { - morpheus::test::TestWithPythonInterpreter::SetUp(); - { - pybind11::gil_scoped_acquire gil; - - // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: - // https://github.com/rapidsai/cudf/issues/12862 - CudfHelper::load(); - } - } -}; +{}; TEST_F(TestDevDocEx3, TestPyObjFromMultiMesg) { diff --git a/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp b/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp index cf53f6ea2a..d1ca4a8dcb 100644 --- a/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp +++ b/python/morpheus/morpheus/_lib/tests/messages/test_messages.hpp @@ -19,26 +19,11 
@@ #include "../test_utils/common.hpp" // IWYU pragma: associated -#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper - #include namespace morpheus::test { class TestMessages : public morpheus::test::TestWithPythonInterpreter -{ - protected: - void SetUp() override - { - morpheus::test::TestWithPythonInterpreter::SetUp(); - { - pybind11::gil_scoped_acquire gil; - - // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: - // https://github.com/rapidsai/cudf/issues/12862 - CudfHelper::load(); - } - } -}; +{}; } // namespace morpheus::test diff --git a/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp b/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp index cbefd8355e..27f477511b 100644 --- a/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp +++ b/python/morpheus/morpheus/_lib/tests/stages/test_triton_inference_stage.cpp @@ -27,7 +27,6 @@ #include "morpheus/stages/inference_client_stage.hpp" // for TensorModelMapping, InferenceClientStage, IInferenceCl... #include "morpheus/stages/triton_inference.hpp" // for TritonInferenceClient, TritonInferInput, TritonInferRe... #include "morpheus/types.hpp" // for TensorMap -#include "morpheus/utilities/cudf_util.hpp" // for CudfHelper #include "morpheus/utilities/matx_util.hpp" // for MatxUtil #include // for cudaMemcpy, cudaMemcpyKind @@ -43,7 +42,6 @@ #include // for Error, InferOptions, InferenceServerHttpClient, InferR... 
#include // for Task #include // for TestScheduler -#include // for gil_scoped_acquire #include // for cuda_stream_per_thread #include // for device_buffer #include // for get_current_device_resource @@ -291,20 +289,7 @@ class ErrorProneTritonClient : public FakeTritonClient }; class TestTritonInferenceStage : public morpheus::test::TestWithPythonInterpreter -{ - protected: - void SetUp() override - { - morpheus::test::TestWithPythonInterpreter::SetUp(); - { - pybind11::gil_scoped_acquire gil; - - // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: - // https://github.com/rapidsai/cudf/issues/12862 - morpheus::CudfHelper::load(); - } - } -}; +{}; cudf::io::table_with_metadata create_test_table_with_metadata(uint32_t rows) { diff --git a/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp b/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp index 552e5bb8a7..55b5465ae3 100644 --- a/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp +++ b/python/morpheus/morpheus/_lib/tests/test_file_in_out.cpp @@ -20,7 +20,6 @@ #include "morpheus/io/deserializers.hpp" #include "morpheus/io/serializers.hpp" #include "morpheus/messages/meta.hpp" -#include "morpheus/utilities/cudf_util.hpp" #include #include @@ -48,20 +47,7 @@ std::string read_file(const std::filesystem::path& file_path) } class TestFileInOut : public morpheus::test::TestWithPythonInterpreter -{ - protected: - void SetUp() override - { - morpheus::test::TestWithPythonInterpreter::SetUp(); - { - pybind11::gil_scoped_acquire gil; - - // Initially I ran into an issue bootstrapping cudf, I was able to work-around the issue, details in: - // https://github.com/rapidsai/cudf/issues/12862 - CudfHelper::load(); - } - } -}; +{}; TEST_F(TestFileInOut, RoundTripCSV) { diff --git a/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp b/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp index 1c8eb86fa8..c58f708b6e 100644 --- 
a/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp +++ b/python/morpheus/morpheus/_lib/tests/test_utils/common.cpp @@ -23,7 +23,6 @@ #include "morpheus/io/loaders/payload.hpp" #include "morpheus/io/loaders/rest.hpp" #include "morpheus/messages/meta.hpp" -#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/string_util.hpp" #include // for PyStatus_Exception, PyConfig_Clear, PyConfig_InitPythonConfig @@ -81,9 +80,6 @@ void TestWithPythonInterpreter::SetUp() false); pybind11::gil_scoped_acquire gil; - - // Ensure that the cudf helpers are loaded so we can convert dataframes to MessageMeta - CudfHelper::load(); } void TestWithPythonInterpreter::TearDown() {} diff --git a/python/morpheus/morpheus/cli/commands.py b/python/morpheus/morpheus/cli/commands.py index 1b885ecd81..ddc61af5bf 100644 --- a/python/morpheus/morpheus/cli/commands.py +++ b/python/morpheus/morpheus/cli/commands.py @@ -33,6 +33,7 @@ from morpheus.config import ConfigFIL from morpheus.config import ConfigOnnxToTRT from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.config import PipelineModes from morpheus.utils.file_utils import load_labels_file from morpheus.utils.logger import configure_logging @@ -282,8 +283,14 @@ def install(**kwargs): @click.option('--use_cpp', default=True, type=bool, - help=("Whether or not to use C++ node and message types or to prefer python. " - "Only use as a last resort if bugs are encountered")) + help=("[Deprecated] Whether or not to use C++ node and message types or to prefer python. " + "Only use as a last resort if bugs are encountered. Cannot be used with --use_cpu_only")) +@click.option('--use_cpu_only', + default=False, + type=bool, + is_flag=True, + help=("Whether or not to run in CPU only mode, setting this to True will disable C++ mode. 
" + "Cannot be used with --use_cpp")) @click.option('--manual_seed', default=None, type=click.IntRange(min=1), @@ -292,8 +299,26 @@ def install(**kwargs): @prepare_command(parse_config=True) def run(ctx: click.Context, **kwargs): """Run subcommand, used for running a pipeline""" - # Since the option isnt the same name as `should_use_cpp` anymore, manually set the value here. - CppConfig.set_should_use_cpp(kwargs.pop("use_cpp", CppConfig.get_should_use_cpp())) + + if (ctx.get_parameter_source("use_cpu_only") is not click.core.ParameterSource.DEFAULT + and ctx.get_parameter_source("use_cpp") is not click.core.ParameterSource.DEFAULT): + # If the user set explicit values for both use_cpu_only and use_cpp raise an error + raise click.UsageError("Cannot set both --use_cpp and --use_cpu_only. The --use_cpp flag is deprecated. " + "Use only --use_cpu_only.") + + use_cpu_only = kwargs.pop("use_cpu_only") + use_cpp = kwargs.pop("use_cpp") + + # only check this value if the flag was explicitly set by the user + if ctx.get_parameter_source("use_cpp") is not click.core.ParameterSource.DEFAULT: + logger.warning("The --use_cpp flag is deprecated and will be removed in a future release") + + execution_mode = ExecutionMode.GPU if use_cpp else ExecutionMode.CPU + else: + execution_mode = ExecutionMode.CPU if use_cpu_only else ExecutionMode.GPU + + config = get_config_from_ctx(ctx) + config.execution_mode = execution_mode manual_seed_val = kwargs.pop("manual_seed", None) if manual_seed_val is not None: diff --git a/python/morpheus/morpheus/config.py b/python/morpheus/morpheus/config.py index 15e0416819..2b0073103e 100644 --- a/python/morpheus/morpheus/config.py +++ b/python/morpheus/morpheus/config.py @@ -140,6 +140,11 @@ class PipelineModes(str, Enum): AE = "AE" +class ExecutionMode(str, Enum): + GPU = "GPU" + CPU = "CPU" + + class CppConfig: """ Allows setting whether C++ implementations should be used for Morpheus stages and messages. 
Defaults to True, @@ -199,6 +204,7 @@ class Config(ConfigBase): log_config_file : str File corresponding to this Config. """ + execution_mode: ExecutionMode = ExecutionMode.GPU # Whether in Debug mode. debug: bool = False @@ -219,6 +225,41 @@ class Config(ConfigBase): ae: ConfigAutoEncoder = dataclasses.field(default=None) fil: ConfigFIL = dataclasses.field(default=None) + frozen: bool = False + + def freeze(self): + """ + Freeze the Config object, making it immutable. This method will be invoked when the config object is passed to + a pipeline or stage for the first time. + + Calling `freeze` on a frozen instance will not have any effect. + """ + self._check_cpp_mode(fix_mis_match=not self.frozen) + if not self.frozen: + self.frozen = True + + def _check_cpp_mode(self, fix_mis_match: bool = False): + """ + Check that the C++ mode setting matches the execution mode. + + Parameters + ---------- + fix_mis_match : bool + If True, set the C++ mode to the correct value. If False, raise an exception if the value is incorrect. + """ + should_use_cpp: bool = (self.execution_mode == ExecutionMode.GPU) + if fix_mis_match: + CppConfig.set_should_use_cpp(should_use_cpp) + elif CppConfig.get_should_use_cpp() != should_use_cpp: + raise ValueError( + f"Execution mode {self.execution_mode} does not match C++ mode {CppConfig.get_should_use_cpp()}") + + def __setattr__(self, name, value): + # During __init__ the `frozen` field may not be assigned yet; lookup then falls back to the class-level default (False), so assignment is permitted until `freeze` is called.
+ if self.frozen: + raise dataclasses.FrozenInstanceError("Cannot modify frozen Config object.") + + super().__setattr__(name, value) @property def pipeline_batch_size(self): diff --git a/python/morpheus/morpheus/controllers/file_to_df_controller.py b/python/morpheus/morpheus/controllers/file_to_df_controller.py index c8478c6ce1..e948a78dc4 100644 --- a/python/morpheus/morpheus/controllers/file_to_df_controller.py +++ b/python/morpheus/morpheus/controllers/file_to_df_controller.py @@ -24,8 +24,6 @@ import fsspec import pandas as pd -import cudf - from morpheus.common import FileTypes from morpheus.io.deserializers import read_file_to_df from morpheus.utils.column_info import DataFrameInputSchema @@ -130,7 +128,7 @@ def __init__(self, self._downloader = Downloader(download_method=download_method) def _get_or_create_dataframe_from_batch( - self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[cudf.DataFrame, bool]: + self, file_object_batch: typing.Tuple[fsspec.core.OpenFiles, int]) -> typing.Tuple[pd.DataFrame, bool]: if (not file_object_batch): raise RuntimeError("No file objects to process") @@ -209,7 +207,7 @@ def convert_to_dataframe(self, file_object_batch: typing.Tuple[fsspec.core.OpenF Returns ------- - cudf.DataFrame + pd.DataFrame The resulting DataFrame. 
""" diff --git a/python/morpheus/morpheus/controllers/filter_detections_controller.py b/python/morpheus/morpheus/controllers/filter_detections_controller.py index 10da7b7c9b..7737e061d7 100644 --- a/python/morpheus/morpheus/controllers/filter_detections_controller.py +++ b/python/morpheus/morpheus/controllers/filter_detections_controller.py @@ -15,11 +15,11 @@ import logging import typing -import cupy as cp import numpy as np from morpheus.common import FilterSource from morpheus.messages import ControlMessage +from morpheus.utils.type_aliases import NDArrayType logger = logging.getLogger(__name__) @@ -64,18 +64,13 @@ def field_name(self): """ return self._field_name - def _find_detections(self, msg: ControlMessage) -> typing.Union[cp.ndarray, np.ndarray]: + def _find_detections(self, msg: ControlMessage) -> NDArrayType: # Determine the filter source if self._filter_source == FilterSource.TENSOR: filter_source = msg.tensors().get_tensor(self._field_name) else: filter_source = msg.payload().get_data(self._field_name).values - if (isinstance(filter_source, np.ndarray)): - array_mod = np - else: - array_mod = cp - # Get per row detections detections = (filter_source > self._threshold) @@ -83,9 +78,9 @@ def _find_detections(self, msg: ControlMessage) -> typing.Union[cp.ndarray, np.n detections = detections.any(axis=1) # Surround in False to ensure we get an even number of pairs - detections = array_mod.concatenate([array_mod.array([False]), detections, array_mod.array([False])]) + detections = np.concatenate([np.array([False]), detections, np.array([False])]) - return array_mod.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) + return np.where(detections[1:] != detections[:-1])[0].reshape((-1, 2)) def filter_copy(self, msg: ControlMessage) -> ControlMessage: """ diff --git a/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py b/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py index 8bc1be6829..b7065130c7 100644 --- 
a/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py +++ b/python/morpheus/morpheus/controllers/mlflow_model_writer_controller.py @@ -33,10 +33,11 @@ from mlflow.types.utils import _infer_pandas_column as _mlflow_infer_pandas_column from mlflow.types.utils import _infer_schema -import cudf - from morpheus.messages import ControlMessage -from morpheus.models.dfencoder import AutoEncoder +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + from morpheus.models.dfencoder import AutoEncoder logger = logging.getLogger(__name__) @@ -235,7 +236,7 @@ def on_data(self, message: ControlMessage) -> ControlMessage: user = message.get_metadata("user_id") - model: AutoEncoder = message.get_metadata("model") + model: "AutoEncoder" = message.get_metadata("model") model_path = "dfencoder" reg_model_name = self.user_id_to_model(user_id=user) @@ -283,7 +284,7 @@ def on_data(self, message: ControlMessage) -> ControlMessage: # prepare_df to show the actual inputs to the model (any extra are discarded) input_df = message.payload().get_data().iloc[0:1] - if isinstance(input_df, cudf.DataFrame): + if is_cudf_type(input_df): input_df = input_df.to_pandas() prepared_df = model.prepare_df(input_df) diff --git a/python/morpheus/morpheus/controllers/monitor_controller.py b/python/morpheus/morpheus/controllers/monitor_controller.py index 21916a3eb7..940d079097 100644 --- a/python/morpheus/morpheus/controllers/monitor_controller.py +++ b/python/morpheus/morpheus/controllers/monitor_controller.py @@ -19,12 +19,12 @@ import fsspec from tqdm import tqdm -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.logger import LogLevels from morpheus.utils.monitor_utils import MorpheusTqdm +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_dataframe logger = logging.getLogger(__name__) @@ -57,6 +57,7 @@ class MonitorController: Custom 
implementation of tqdm if required. """ + SupportedTypes = typing.Union[DataFrameType, MessageMeta, ControlMessage, list] controller_count: int = 0 def __init__(self, @@ -125,20 +126,19 @@ def refresh_progress(self, _): """ self._progress.refresh() - def progress_sink(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, list]): + def progress_sink(self, msg: SupportedTypes) -> SupportedTypes: """ Receives a message and determines the count of the message. The progress bar is displayed and the progress is updated. Parameters ---------- - msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List] + msg: SupportedTypes Message that determines the count of the message Returns ------- - msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, list] - + SupportedTypes """ # Make sure the progress bar is shown @@ -158,14 +158,14 @@ def progress_sink(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMe return msg - def auto_count_fn(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List]): + def auto_count_fn(self, msg: SupportedTypes) -> typing.Callable[[SupportedTypes], int] | None: """ This is a helper function that is used to determine the count of messages received by the monitor. 
Parameters ---------- - msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMessage, typing.List] + msg: SupportedTypes Message that determines the count of the message Returns @@ -183,7 +183,7 @@ def auto_count_fn(self, msg: typing.Union[cudf.DataFrame, MessageMeta, ControlMe if (isinstance(msg, list) and len(msg) == 0): return None - if (isinstance(msg, cudf.DataFrame)): + if (is_dataframe(msg)): return lambda y: len(y.index) if (isinstance(msg, MessageMeta)): diff --git a/python/morpheus/morpheus/controllers/rss_controller.py b/python/morpheus/morpheus/controllers/rss_controller.py index c8d47d6696..6334f4f23c 100644 --- a/python/morpheus/morpheus/controllers/rss_controller.py +++ b/python/morpheus/morpheus/controllers/rss_controller.py @@ -27,9 +27,10 @@ import requests import requests_cache -import cudf - from morpheus.messages import MessageMeta +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_class logger = logging.getLogger(__name__) @@ -105,7 +106,8 @@ def __init__(self, strip_markup: bool = False, stop_after: int = 0, interval_secs: float = 600, - should_stop_fn: Callable[[], bool] = None): + should_stop_fn: Callable[[], bool] = None, + df_type: DataFrameModule = "cudf"): if IMPORT_EXCEPTION is not None: raise ImportError(IMPORT_ERROR_MESSAGE) from IMPORT_EXCEPTION @@ -141,6 +143,7 @@ def __init__(self, self._run_indefinitely = run_indefinitely self._interval_secs = interval_secs self._interval_td = timedelta(seconds=self._interval_secs) + self._df_class: type[DataFrameType] = get_df_class(df_type) self._enable_cache = enable_cache @@ -349,7 +352,7 @@ def fetch_dataframes(self): Yeilds ------ - cudf.DataFrame + DataFrameType A DataFrame containing feed entry data. 
Raises @@ -374,14 +377,14 @@ def fetch_dataframes(self): entry_accumulator.append(entry) if self._batch_size > 0 and len(entry_accumulator) >= self._batch_size: - yield cudf.DataFrame(entry_accumulator) + yield self._df_class(entry_accumulator) entry_accumulator.clear() self._previous_entries = current_entries # Yield any remaining entries. if entry_accumulator: - yield cudf.DataFrame(entry_accumulator) + yield self._df_class(entry_accumulator) else: logger.debug("No new entries found.") diff --git a/python/morpheus/morpheus/io/deserializers.py b/python/morpheus/morpheus/io/deserializers.py index 31499b4359..15867664f7 100644 --- a/python/morpheus/morpheus/io/deserializers.py +++ b/python/morpheus/morpheus/io/deserializers.py @@ -17,23 +17,38 @@ import io import typing -import pandas as pd - -import cudf +import numpy as np from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.common import read_file_to_df as read_file_to_df_cpp from morpheus.config import CppConfig from morpheus.io.utils import filter_null_data +from morpheus.io.utils import get_csv_reader +from morpheus.io.utils import get_json_reader +from morpheus.io.utils import get_parquet_reader +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType +def get_reader(file_type: FileTypes, df_type: DataFrameModule) -> typing.Callable[..., DataFrameType]: + if (file_type == FileTypes.CSV): + return get_csv_reader(df_type) + + if (file_type == FileTypes.JSON): + return get_json_reader(df_type) + + if (file_type == FileTypes.PARQUET): + return get_parquet_reader(df_type) + + raise ValueError(f"Unsupported file type: {file_type}") + + def _read_file_to_df_py(*, file_name: typing.Union[str, io.IOBase], file_type: FileTypes, parser_kwargs: dict, - df_type: typing.Literal["cudf", "pandas"]) -> DataFrameType: + df_type: DataFrameModule) -> DataFrameType: if (parser_kwargs is None): parser_kwargs = {} @@ -59,29 +74,15 
@@ def _read_file_to_df_py(*, # Update with any args set by the user. User values overwrite defaults kwargs.update(parser_kwargs) + reader = get_reader(mode, df_type) - df_class = cudf if df_type == "cudf" else pd - - df = None - if (mode == FileTypes.JSON): - df = df_class.read_json(file_name, **kwargs) - - elif (mode == FileTypes.CSV): - df: pd.DataFrame = df_class.read_csv(file_name, **kwargs) - - if (len(df.columns) > 1 and df.columns[0] == "Unnamed: 0" and df.iloc[:, 0].dtype == cudf.dtype(int)): + df: DataFrameType = reader(file_name, **kwargs) + if (mode == FileTypes.CSV): + if (len(df.columns) > 1 and df.columns[0] == "Unnamed: 0" and df.iloc[:, 0].dtype == np.dtype(int)): df.set_index("Unnamed: 0", drop=True, inplace=True) df.index.name = "" df.sort_index(inplace=True) - elif (mode == FileTypes.PARQUET): - df = df_class.read_parquet(file_name, **kwargs) - - else: - assert False, f"Unsupported file type mode: {mode}" - - assert df is not None - return df @@ -90,7 +91,7 @@ def read_file_to_df(file_name: typing.Union[str, io.IOBase], parser_kwargs: dict = None, filter_nulls: bool = True, filter_null_columns: list[str] | str = 'data', - df_type: typing.Literal["cudf", "pandas"] = "pandas") -> DataFrameType: + df_type: DataFrameModule = "pandas") -> DataFrameType: """ Reads a file into a dataframe and performs any of the necessary cleanup. @@ -106,7 +107,7 @@ def read_file_to_df(file_name: typing.Union[str, io.IOBase], Whether to filter null rows after loading, by default True. filter_null_columns : list[str]|str, default = 'data' Column or columns to filter null values from. Ignored when `filter_null` is False. - df_type : typing.Literal[, optional + df_type : typing.Literal["cudf", "pandas"], optional What type of parser to use. Options are 'cudf' and 'pandas', by default "pandas". 
Returns diff --git a/python/morpheus/morpheus/io/serializers.py b/python/morpheus/morpheus/io/serializers.py index 90822ab6f7..b82b82e99c 100644 --- a/python/morpheus/morpheus/io/serializers.py +++ b/python/morpheus/morpheus/io/serializers.py @@ -19,13 +19,12 @@ from io import IOBase from io import StringIO -import cudf - from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.common import write_df_to_file as write_df_to_file_cpp from morpheus.config import CppConfig from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type def df_to_stream_csv(df: DataFrameType, stream: IOBase, include_header=False, include_index_col=True): @@ -203,10 +202,11 @@ def write_df_to_file(df: DataFrameType, file_name: str, file_type: FileTypes = F Additional arguments forwarded to the underlying serialization function. Where the underlying serialization function is one of `write_df_to_file_cpp`, `df_to_stream_csv`, or `df_to_stream_json`. """ - if (CppConfig.get_should_use_cpp() and isinstance(df, cudf.DataFrame)): - # Use the C++ implementation - write_df_to_file_cpp(df=df, filename=file_name, file_type=file_type, **kwargs) - return + if (CppConfig.get_should_use_cpp()): + if (is_cudf_type(df)): + # Use the C++ implementation + write_df_to_file_cpp(df=df, filename=file_name, file_type=file_type, **kwargs) + return mode = file_type diff --git a/python/morpheus/morpheus/io/utils.py b/python/morpheus/morpheus/io/utils.py index 9a20afb4d5..58e5413f91 100644 --- a/python/morpheus/morpheus/io/utils.py +++ b/python/morpheus/morpheus/io/utils.py @@ -14,14 +14,21 @@ # limitations under the License. 
"""IO utilities.""" +import functools import logging +import typing import pandas as pd -import cudf - +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import df_type_str_to_exec_mode +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + import cudf logger = logging.getLogger(__name__) @@ -44,7 +51,7 @@ def filter_null_data(x: DataFrameType, column_name: str = "data") -> DataFrameTy return x[~x[column_name].isna()] -def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict[str, int]) -> bool: +def cudf_string_cols_exceed_max_bytes(df: "cudf.DataFrame", column_max_bytes: dict[str, int]) -> bool: """ Checks a cudf DataFrame for string columns that exceed a maximum number of bytes and thus need to be truncated by calling `truncate_string_cols_by_bytes`. @@ -64,6 +71,7 @@ def cudf_string_cols_exceed_max_bytes(df: cudf.DataFrame, column_max_bytes: dict bool True if truncation is needed, False otherwise. 
""" + import cudf if not isinstance(df, cudf.DataFrame): raise ValueError("Expected cudf DataFrame") @@ -101,7 +109,7 @@ def truncate_string_cols_by_bytes(df: DataFrameType, """ performed_truncation = False - is_cudf = isinstance(df, cudf.DataFrame) + is_cudf = is_cudf_type(df) for (col, max_bytes) in column_max_bytes.items(): series: SeriesType = df[col] @@ -124,8 +132,90 @@ def truncate_string_cols_by_bytes(df: DataFrameType, decoded_series = truncated_series.str.decode(encoding='utf-8', errors='ignore') if is_cudf: + import cudf df[col] = cudf.Series.from_pandas(decoded_series) else: df[col] = decoded_series return performed_truncation + + +def _selector_to_exec_mode(selector: DataFrameModule | ExecutionMode) -> ExecutionMode: + if not isinstance(selector, ExecutionMode): + execution_mode = df_type_str_to_exec_mode(selector) + else: + execution_mode = selector + + return execution_mode + + +def _get_df_method(selector: DataFrameModule | ExecutionMode, method_name: str) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate DataFrame method based on the execution mode. + """ + execution_mode = _selector_to_exec_mode(selector) + + if (execution_mode == ExecutionMode.GPU): + import cudf + method = getattr(cudf, method_name) + else: + method = getattr(pd, method_name) + + return method + + +@typing.overload +def get_csv_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... + + +@typing.overload +def get_csv_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_csv_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate CSV reader based on the execution mode. + """ + return _get_df_method(selector, 'read_csv') + + +@typing.overload +def get_json_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... 
+ + +@typing.overload +def get_json_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_json_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate JSON reader based on the execution mode. + """ + execution_mode = _selector_to_exec_mode(selector) + reader = _get_df_method(execution_mode, 'read_json') + + if (execution_mode == ExecutionMode.GPU): + reader = functools.partial(reader, engine='cudf') + + return reader + + +@typing.overload +def get_parquet_reader(selector: DataFrameModule) -> typing.Callable[..., DataFrameType]: + ... + + +@typing.overload +def get_parquet_reader(selector: ExecutionMode) -> typing.Callable[..., DataFrameType]: + ... + + +def get_parquet_reader(selector: DataFrameModule | ExecutionMode) -> typing.Callable[..., DataFrameType]: + """ + Return the appropriate Parquet reader based on the execution mode. + """ + return _get_df_method(selector, 'read_parquet') diff --git a/python/morpheus/morpheus/messages/__init__.py b/python/morpheus/morpheus/messages/__init__.py index 867c41fefc..c6cb27c15c 100644 --- a/python/morpheus/morpheus/messages/__init__.py +++ b/python/morpheus/morpheus/messages/__init__.py @@ -18,7 +18,6 @@ # Import order is very important here. 
Import base classes before child ones # isort: off -from morpheus._lib.messages import ControlMessage from morpheus._lib.messages import DataLoaderRegistry from morpheus._lib.messages import RawPacketMessage from morpheus.messages.memory.tensor_memory import TensorMemory @@ -32,9 +31,12 @@ from morpheus.messages.message_base import MessageBase from morpheus.messages.message_meta import MessageMeta from morpheus.messages.message_meta import UserMessageMeta +from morpheus.messages.control_message import ControlMessageType +from morpheus.messages.control_message import ControlMessage __all__ = [ "ControlMessage", + "ControlMessageType", "DataLoaderRegistry", "InferenceMemory", "InferenceMemoryAE", diff --git a/python/morpheus/morpheus/messages/control_message.py b/python/morpheus/morpheus/messages/control_message.py new file mode 100644 index 0000000000..8c958572e8 --- /dev/null +++ b/python/morpheus/morpheus/messages/control_message.py @@ -0,0 +1,203 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pylint: disable=cyclic-import + +import dataclasses +import logging +import re +import typing +from collections import defaultdict +from collections import deque +from datetime import datetime + +# Users of this module should import ControlMessageType from morpheus.messages, we can't do that here without causing a +# circular import error, instead we import it from the _lib module, we don't want to put `_messages.ControlMessageType` +# in the public API and confuse users +import morpheus._lib.messages as _messages +from morpheus._lib.messages import ControlMessageType # pylint: disable=morpheus-incorrect-lib-from-import +from morpheus.cli.utils import get_enum_keys +from morpheus.cli.utils import get_enum_members +from morpheus.messages.memory.tensor_memory import TensorMemory +from morpheus.messages.message_base import MessageBase +from morpheus.messages.message_meta import MessageMeta + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass(init=False) +class ControlMessage(MessageBase, cpp_class=_messages.ControlMessage): + + def __init__(self, config_or_message: typing.Union["ControlMessage", dict] = None): + super().__init__() + + self._config: dict = {"metadata": {}} + + self._payload: MessageMeta = None + self._tensors: TensorMemory = None + + self._tasks: dict[str, deque] = defaultdict(deque) + self._timestamps: dict[str, datetime] = {} + self._type: ControlMessageType = ControlMessageType.NONE + + if isinstance(config_or_message, dict): + self.config(config_or_message) + elif isinstance(config_or_message, ControlMessage): + self._copy_impl(config_or_message, self) + elif config_or_message is not None: + raise ValueError(f"Invalid argument type {type(config_or_message)}, value must be a dict or ControlMessage") + + def copy(self) -> "ControlMessage": + return self._copy_impl(self) + + def config(self, config: dict = None) -> dict: + if config is not None: + cm_type: str | ControlMessageType = config.get("type") + if cm_type is not None: + if 
isinstance(cm_type, str): + try: + cm_type = get_enum_members(ControlMessageType)[cm_type] + except KeyError as exc: + enum_names = ", ".join(get_enum_keys(ControlMessageType)) + raise ValueError( + f"Invalid ControlMessageType: {cm_type}, supported types: {enum_names}") from exc + + self._type = cm_type + + tasks = config.get("tasks") + if tasks is not None: + for task in tasks: + self.add_task(task["type"], task["properties"]) + + self._config = {"metadata": config.get("metadata", {}).copy()} + + return self._config + + def has_task(self, task_type: str) -> bool: + """ + Return True if the control message has at least one task of the given type + """ + # Using `get` to avoid creating an empty list if the task type is not present + tasks = self._tasks.get(task_type, []) + return len(tasks) > 0 + + def add_task(self, task_type: str, task: dict): + if isinstance(task_type, str): + cm_type = get_enum_members(ControlMessageType).get(task_type, ControlMessageType.NONE) + if cm_type != ControlMessageType.NONE: + if self._type == ControlMessageType.NONE: + self._type = cm_type + elif self._type != cm_type: + raise ValueError("Cannot mix different types of tasks on the same control message") + + self._tasks[task_type].append(task) + + def remove_task(self, task_type: str) -> dict: + tasks = self._tasks.get(task_type, []) + if len(tasks) == 0: + raise ValueError(f"No task of type {task_type} found") + + return tasks.popleft() + + def get_tasks(self) -> dict[str, deque]: + return self._tasks + + def set_metadata(self, key: str, value: typing.Any): + self._config["metadata"][key] = value + + def has_metadata(self, key: str) -> bool: + return key in self._config["metadata"] + + def get_metadata(self, key: str = None, default_value: typing.Any = None) -> typing.Any: + """ + Return a given piece of metadata, if `key` is `None` return the entire metadata dictionary. + If `key` is not found, `default_value` is returned. 
+ + :param key: The key of the metadata to retrieve, or None for all metadata + :param default_value: The value to return if the key is not found, ignored if `key` is None + :return: The value of the metadata key, or the entire metadata dictionary if `key` is None + """ + + # Not using `get` since `None` is a valid value + if key is None: + return self._config["metadata"] + + return self._config["metadata"].get(key, default_value) + + def list_metadata(self) -> list[str]: + return sorted(self._config["metadata"].keys()) + + def payload(self, payload: MessageMeta = None) -> MessageMeta | None: + if payload is not None: + self._payload = payload + + return self._payload + + def tensors(self, tensors: TensorMemory = None) -> TensorMemory | None: + if tensors is not None: + self._tensors = tensors + + return self._tensors + + def task_type(self, new_task_type: ControlMessageType = None) -> ControlMessageType: + if new_task_type is not None: + self._type = new_task_type + + return self._type + + def set_timestamp(self, key: str, timestamp: datetime): + self._timestamps[key] = timestamp + + def get_timestamp(self, key: str, fail_if_nonexist: bool = False) -> datetime | None: + try: + return self._timestamps[key] + except KeyError as e: + if fail_if_nonexist: + raise ValueError("Timestamp for the specified key does not exist.") from e + return None + + def filter_timestamp(self, regex_filter: str) -> dict[str, datetime]: + re_obj = re.compile(regex_filter) + + return {key: value for key, value in self._timestamps.items() if re_obj.match(key)} + + def _export_config(self) -> dict: + # Unfortunately there is no parity between the `config` object that the constructor accepts and the value + # returned by the `config` method. This method returns a config object that can be used to create a new instance + # with the same task type and tasks. 
+ config = self.config().copy() + config["type"] = self.task_type().name + + tasks = [] + for (task_type, task_queue) in self.get_tasks().items(): + for task in task_queue: + tasks.append({"type": task_type, "properties": task}) + + config["tasks"] = tasks + + return config + + @classmethod + def _copy_impl(cls, src: "ControlMessage", dst: "ControlMessage" = None) -> "ControlMessage": + config = src._export_config() + + if dst is None: + dst = cls() + + dst.config(config) + dst.payload(src.payload()) + dst.tensors(src.tensors()) + dst._timestamps = src._timestamps.copy() + + return dst diff --git a/python/morpheus/morpheus/messages/memory/inference_memory.py b/python/morpheus/morpheus/messages/memory/inference_memory.py index 9bdc7b6503..6913515fe8 100644 --- a/python/morpheus/morpheus/messages/memory/inference_memory.py +++ b/python/morpheus/morpheus/messages/memory/inference_memory.py @@ -15,18 +15,17 @@ import dataclasses -import cupy as cp - import morpheus._lib.messages as _messages from morpheus.messages.data_class_prop import DataClassProp from morpheus.messages.memory.tensor_memory import TensorMemory +from morpheus.utils.type_aliases import NDArrayType @dataclasses.dataclass(init=False) class InferenceMemory(TensorMemory, cpp_class=_messages.InferenceMemory): """ This is a base container class for data that will be used for inference stages. This class is designed to - hold generic tensor data in cupy arrays. + hold generic tensor data in either CuPy or NumPy arrays. """ def get_input(self, name: str): @@ -40,7 +39,7 @@ def get_input(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Inputs corresponding to name. Raises @@ -50,7 +49,7 @@ def get_input(self, name: str): """ return self.get_tensor(name) - def set_input(self, name: str, tensor: cp.ndarray): + def set_input(self, name: str, tensor: NDArrayType): """ Update the input tensor identified by `name`. 
Alias for `InferenceMemory.set_tensor` @@ -58,8 +57,8 @@ def set_input(self, name: str, tensor: cp.ndarray): ---------- name : str Key used to do lookup in inputs dict of the container. - tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either CuPy or NumPy array. """ self.set_tensor(name, tensor) @@ -72,23 +71,23 @@ class InferenceMemoryNLP(InferenceMemory, cpp_class=_messages.InferenceMemoryNLP Parameters ---------- - input_ids : cupy.ndarray + input_ids : NDArrayType The token-ids for each string padded with 0s to max_length. - input_mask : cupy.ndarray + input_mask : NDArrayType The mask for token-ids result where corresponding positions identify valid token-id values. - seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). """ - input_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) - input_mask: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, + input_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) + input_mask: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) - def __init__(self, *, count: int, input_ids: cp.ndarray, input_mask: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, input_ids: NDArrayType, input_mask: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input_ids': input_ids, 'input_mask': input_mask, 'seq_ids': seq_ids}) @@ 
-100,19 +99,19 @@ class InferenceMemoryFIL(InferenceMemory, cpp_class=_messages.InferenceMemoryFIL Parameters ---------- - input__0 : cupy.ndarray + input__0 : NDArrayType Inference input. - seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). """ - input__0: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, + input__0: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) - def __init__(self, *, count: int, input__0: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, input__0: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input__0': input__0, 'seq_ids': seq_ids}) @@ -123,16 +122,16 @@ class InferenceMemoryAE(InferenceMemory, cpp_class=None): Parameters ---------- - inputs : cupy.ndarray + inputs : NDArrayType Inference input. - seq_ids : cupy.ndarray + seq_ids : NDArrayType Ids used to index from an inference input to a message. Necessary since there can be more inference inputs than messages (i.e., if some messages get broken into multiple inference requests). 
""" - input: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) - seq_ids: dataclasses.InitVar[cp.ndarray] = DataClassProp(InferenceMemory._get_tensor_prop, - InferenceMemory.set_input) + input: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, InferenceMemory.set_input) + seq_ids: dataclasses.InitVar[NDArrayType] = DataClassProp(InferenceMemory._get_tensor_prop, + InferenceMemory.set_input) - def __init__(self, *, count: int, inputs: cp.ndarray, seq_ids: cp.ndarray): + def __init__(self, *, count: int, inputs: NDArrayType, seq_ids: NDArrayType): super().__init__(count=count, tensors={'input': inputs, 'seq_ids': seq_ids}) diff --git a/python/morpheus/morpheus/messages/memory/response_memory.py b/python/morpheus/morpheus/messages/memory/response_memory.py index eb4318f928..bcf6a4c61b 100644 --- a/python/morpheus/morpheus/messages/memory/response_memory.py +++ b/python/morpheus/morpheus/messages/memory/response_memory.py @@ -16,12 +16,13 @@ import dataclasses import logging -import cupy as cp +import pandas as pd import morpheus._lib.messages as _messages from morpheus.messages.data_class_prop import DataClassProp from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.utils import logger as morpheus_logger +from morpheus.utils.type_aliases import NDArrayType logger = logging.getLogger(__name__) @@ -45,7 +46,7 @@ def get_output(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensors corresponding to name. Raises @@ -56,7 +57,7 @@ def get_output(self, name: str): """ return self.get_tensor(name) - def set_output(self, name: str, tensor: cp.ndarray): + def set_output(self, name: str, tensor: NDArrayType): """ Update the output tensor identified by `name`. Alias for `ResponseMemory.set_tensor` @@ -64,8 +65,8 @@ def set_output(self, name: str, tensor: cp.ndarray): ---------- name : str Key used to do lookup in tensors dict of the container. 
- tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either a CuPy or NumPy array. Raises ------ @@ -82,12 +83,12 @@ class ResponseMemoryProbs(ResponseMemory, cpp_class=_messages.ResponseMemoryProb Parameters ---------- - probs : cupy.ndarray + probs : NDArrayType Probabilities tensor """ - probs: dataclasses.InitVar[cp.ndarray] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) + probs: dataclasses.InitVar[NDArrayType] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) - def __init__(self, *, count: int, probs: cp.ndarray): + def __init__(self, *, count: int, probs: NDArrayType): super().__init__(count=count, tensors={'probs': probs}) @@ -98,7 +99,7 @@ class ResponseMemoryAE(ResponseMemory, cpp_class=None): Parameters ---------- - probs : cupy.ndarray + probs : NDArrayType Probabilities tensor user_id : str @@ -108,9 +109,9 @@ class ResponseMemoryAE(ResponseMemory, cpp_class=None): Explainability Dataframe, for each feature a column will exist with a name in the form of: `{feature}_z_loss` containing the loss z-score along with `max_abs_z` and `mean_abs_z` columns """ - probs: dataclasses.InitVar[cp.ndarray] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) - user_id = "" - explain_df = None + probs: dataclasses.InitVar[NDArrayType] = DataClassProp(ResponseMemory._get_tensor_prop, ResponseMemory.set_output) + user_id: str = "" + explain_df: pd.DataFrame = None - def __init__(self, *, count: int, probs: cp.ndarray): + def __init__(self, *, count: int, probs: NDArrayType): super().__init__(count=count, tensors={'probs': probs}) diff --git a/python/morpheus/morpheus/messages/memory/tensor_memory.py b/python/morpheus/morpheus/messages/memory/tensor_memory.py index 103240b15f..2e3164585e 100644 --- a/python/morpheus/morpheus/messages/memory/tensor_memory.py +++ b/python/morpheus/morpheus/messages/memory/tensor_memory.py @@ -16,30 +16,30 @@ import dataclasses 
import typing -import cupy as cp - import morpheus._lib.messages as _messages from morpheus.messages.message_base import MessageData +from morpheus.utils.type_aliases import NDArrayType +from morpheus.utils.type_aliases import TensorMapType @dataclasses.dataclass(init=False) class TensorMemory(MessageData, cpp_class=_messages.TensorMemory): """ This is a base container class for data that will be used for inference stages. This class is designed to - hold generic tensor data in cupy arrays. + hold generic tensor data in either CuPy or NumPy arrays. Parameters ---------- count : int Length of each tensor contained in `tensors`. - tensors : typing.Dict[str, cupy.ndarray] + tensors : TensorMapType Collection of tensors uniquely identified by a name. """ count: int - tensors: typing.Dict[str, cp.ndarray] = dataclasses.field(repr=False) + tensors: TensorMapType = dataclasses.field(repr=False) - def __init__(self, *, count: int = None, tensors: typing.Dict[str, cp.ndarray] = None): + def __init__(self, *, count: int = None, tensors: TensorMapType = None): self.count = count @@ -50,11 +50,11 @@ def __init__(self, *, count: int = None, tensors: typing.Dict[str, cp.ndarray] = self._tensors = tensors - def _check_tensors(self, tensors: typing.Dict[str, cp.ndarray]): + def _check_tensors(self, tensors: TensorMapType): for tensor in tensors.values(): self._check_tensor(tensor) - def _check_tensor(self, tensor: cp.ndarray): + def _check_tensor(self, tensor: NDArrayType): if (tensor.shape[0] != self.count): class_name = type(self).__name__ raise ValueError( @@ -96,18 +96,18 @@ def get_tensors(self): Returns ------- - typing.Dict[str, cp.ndarray] + TensorMapType """ return self._tensors - def set_tensors(self, tensors: typing.Dict[str, cp.ndarray]): + def set_tensors(self, tensors: TensorMapType): """ Overwrite the tensors stored by this instance. If the length of the tensors has changed, then the `count` property should also be updated. 
Parameters ---------- - tensors : typing.Dict[str, cupy.ndarray] + tensors : TensorMapType Collection of tensors uniquely identified by a name. """ self._check_tensors(tensors) @@ -124,7 +124,7 @@ def get_tensor(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensor. Raises @@ -145,7 +145,7 @@ def _get_tensor_prop(self, name: str): Returns ------- - cupy.ndarray + NDArrayType Tensor. Raises @@ -158,7 +158,7 @@ def _get_tensor_prop(self, name: str): except KeyError as e: raise AttributeError from e - def set_tensor(self, name: str, tensor: cp.ndarray): + def set_tensor(self, name: str, tensor: NDArrayType): """ Update the tensor identified by `name`. @@ -166,15 +166,13 @@ def set_tensor(self, name: str, tensor: cp.ndarray): ---------- name : str Tensor key name. - tensor : cupy.ndarray - Tensor as a CuPy array. + tensor : NDArrayType + Tensor as either a CuPy or NumPy array. Raises ------ ValueError If the number of rows in `tensor` does not match `count` """ - # Ensure that we have 2D array here (`ensure_2d` inserts the wrong axis) - reshaped_tensor = tensor if tensor.ndim == 2 else cp.reshape(tensor, (tensor.shape[0], -1)) - self._check_tensor(reshaped_tensor) - self._tensors[name] = reshaped_tensor + self._check_tensor(tensor) + self._tensors[name] = tensor diff --git a/python/morpheus/morpheus/messages/message_meta.py b/python/morpheus/morpheus/messages/message_meta.py index ecf542b553..4a3507fdf6 100644 --- a/python/morpheus/morpheus/messages/message_meta.py +++ b/python/morpheus/morpheus/messages/message_meta.py @@ -18,15 +18,14 @@ import typing import warnings -import cupy as cp import numpy as np import pandas as pd -import cudf - import morpheus._lib.messages as _messages from morpheus.messages.message_base import MessageBase +from morpheus.utils import logger as morpheus_logger from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType logger = logging.getLogger(__name__) @@ -49,7 +48,7 @@ class 
MutableTableCtxMgr: def __init__(self, meta) -> None: self.__dict__['__meta'] = meta - def __enter__(self) -> pd.DataFrame: + def __enter__(self) -> DataFrameType: meta = self.__dict__['__meta'] meta._mutex.acquire() return meta._df @@ -206,7 +205,7 @@ def get_meta_range(self, idx = self._df.index[mess_offset:mess_offset + message_count] - if (isinstance(idx, cudf.RangeIndex)): + if (isinstance(idx, pd.RangeIndex)): idx = slice(idx.start, idx.stop - 1, idx.step) if (columns is None): @@ -216,15 +215,15 @@ def get_meta_range(self, return self._df.loc[idx, columns] @typing.overload - def get_data(self) -> cudf.DataFrame: + def get_data(self) -> DataFrameType: ... @typing.overload - def get_data(self, columns: str) -> cudf.Series: + def get_data(self, columns: str) -> SeriesType: ... @typing.overload - def get_data(self, columns: typing.List[str]) -> cudf.DataFrame: + def get_data(self, columns: typing.List[str]) -> DataFrameType: ... def get_data(self, columns: typing.Union[None, str, typing.List[str]] = None): @@ -277,10 +276,6 @@ def set_data(self, columns: typing.Union[None, str, typing.List[str]], value): # First try to set the values on just our slice if the columns exist column_indexer = self._get_col_indexers(df, columns=columns) - # Check if the value is a cupy array and we have a pandas dataframe, convert to numpy - if (isinstance(value, cp.ndarray) and isinstance(df, pd.DataFrame)): - value = value.get() - # Check to see if we are adding a column. If so, we need to use df.loc instead of df.iloc if (-1 not in column_indexer): @@ -299,35 +294,8 @@ def set_data(self, columns: typing.Union[None, str, typing.List[str]], value): # Columns should never be empty if we get here assert columns is not None - # cudf is really bad at adding new columns - if (isinstance(df, cudf.DataFrame)): - - # TODO(morpheus#1487): This logic no longer works in CUDF 24.04. 
- # We should find a way to reinable the no-dropped-index path as - # that should be more performant than dropping the index. - # # saved_index = None - - # # # Check to see if we can use slices - # # if (not (df.index.is_unique and - # # (df.index.is_monotonic_increasing or df.index.is_monotonic_decreasing))): - # # # Save the index and reset - # # saved_index = df.index - # # df.reset_index(drop=True, inplace=True) - - # # # Perform the update via slices - # # df.loc[df.index[row_indexer], columns] = value - - # # # Reset the index if we changed it - # # if (saved_index is not None): - # # df.set_index(saved_index, inplace=True) - - saved_index = df.index - df.reset_index(drop=True, inplace=True) - df.loc[df.index[:], columns] = value - df.set_index(saved_index, inplace=True) - else: - # Now set the slice - df.loc[:, columns] = value + # Now set the slice + df.loc[:, columns] = value def get_slice(self, start, stop): """ @@ -350,12 +318,7 @@ def get_slice(self, start, stop): return MessageMeta(df.iloc[start:stop]) def _ranges_to_mask(self, df, ranges): - if isinstance(df, cudf.DataFrame): - zeros_fn = cp.zeros - else: - zeros_fn = np.zeros - - mask = zeros_fn(len(df), bool) + mask = np.zeros(len(df), bool) for range_ in ranges: mask[range_[0]:range_[1]] = True @@ -399,6 +362,8 @@ class UserMessageMeta(MessageMeta, cpp_class=None): user_id: str = dataclasses.field(init=False) def __init__(self, df: pd.DataFrame, user_id: str) -> None: + from morpheus.messages.control_message import ControlMessage + morpheus_logger.deprecated_message_warning(UserMessageMeta, ControlMessage) super().__init__(df) self.user_id = user_id @@ -418,5 +383,7 @@ class AppShieldMessageMeta(MessageMeta, cpp_class=None): source: str = dataclasses.field(init=False) def __init__(self, df: pd.DataFrame, source: str) -> None: + from morpheus.messages.control_message import ControlMessage + morpheus_logger.deprecated_message_warning(AppShieldMessageMeta, ControlMessage) super().__init__(df) 
self.source = source diff --git a/python/morpheus/morpheus/modules/filter_detections.py b/python/morpheus/morpheus/modules/filter_detections.py index 94ef301862..c0793a6092 100644 --- a/python/morpheus/morpheus/modules/filter_detections.py +++ b/python/morpheus/morpheus/modules/filter_detections.py @@ -81,7 +81,7 @@ def filter_detections(builder: mrc.Builder): field_name = config.get("field_name", "probs") threshold = config.get("threshold", 0.5) filter_source = config.get("filter_source", "AUTO") - use_cpp = config.get("use_cpp", False) + use_cpp = config.get("use_cpp", True) filter_source_dict = {"AUTO": FilterSource.Auto, "DATAFRAME": FilterSource.DATAFRAME, "TENSOR": FilterSource.TENSOR} diff --git a/python/morpheus/morpheus/modules/payload_batcher.py b/python/morpheus/morpheus/modules/payload_batcher.py index ca62a252bd..d3372e40e3 100644 --- a/python/morpheus/morpheus/modules/payload_batcher.py +++ b/python/morpheus/morpheus/modules/payload_batcher.py @@ -13,14 +13,11 @@ # limitations under the License. 
import logging -import typing import warnings import mrc from mrc.core import operators as ops -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.control_message_utils import cm_default_failure_context_manager @@ -28,6 +25,9 @@ from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import PAYLOAD_BATCHER from morpheus.utils.module_utils import register_module +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type logger = logging.getLogger(__name__) @@ -103,7 +103,7 @@ def payload_batcher(builder: mrc.Builder): @cm_skip_processing_if_failed @cm_default_failure_context_manager(raise_on_failure=raise_on_failure) - def on_next(control_message: ControlMessage) -> typing.List[ControlMessage]: + def on_next(control_message: ControlMessage) -> list[ControlMessage]: nonlocal disable_max_batch_size message_meta = control_message.payload() @@ -119,7 +119,7 @@ def on_next(control_message: ControlMessage) -> typing.List[ControlMessage]: return control_messages - def _batch_dataframe(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: + def _batch_dataframe(df: DataFrameType) -> list[DataFrameType]: nonlocal max_batch_size dfm_length = len(df) @@ -131,7 +131,7 @@ def _batch_dataframe(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: dfs = [df.iloc[i * max_batch_size:(i + 1) * max_batch_size] for i in range(num_batches)] return dfs - def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame]: + def _batch_dataframe_by_group(df: DataFrameType) -> list[DataFrameType]: nonlocal max_batch_size nonlocal group_by_columns nonlocal timestamp_column_name @@ -143,9 +143,14 @@ def _batch_dataframe_by_group(df: cudf.DataFrame) -> typing.List[cudf.DataFrame] if has_timestamp_column: # Apply timestamp pattern and group by the formatted timestamp 
column - df[period_column] = cudf.to_datetime(df[timestamp_column_name], format=timestamp_pattern) - # Period object conversion is not supported in cudf - df[period_column] = df[period_column].to_pandas().dt.to_period(period).astype('str') + df_pkg = get_df_pkg_from_obj(df) + period_series = df_pkg.to_datetime(df[timestamp_column_name], format=timestamp_pattern) + + if is_cudf_type(df): + # Period object conversion is not supported in cudf + period_series = period_series.to_pandas() + + df[period_column] = period_series.dt.to_period(period).astype('str') if len(group_by_columns) == 1: # Avoid warning from cudf regardning an upcoming change of behavior when applying a groupby to a single diff --git a/python/morpheus/morpheus/parsers/event_parser.py b/python/morpheus/morpheus/parsers/event_parser.py index a82785b48a..e2e23e5836 100644 --- a/python/morpheus/morpheus/parsers/event_parser.py +++ b/python/morpheus/morpheus/parsers/event_parser.py @@ -14,13 +14,14 @@ """Abstract class for all event log parsers.""" import logging -import typing from abc import ABC from abc import abstractmethod import yaml -import cudf +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj log = logging.getLogger(__name__) @@ -31,13 +32,13 @@ class EventParser(ABC): Parameters ---------- - columns: typing.Set[str] + columns: set[str] Event column names event_name: str Event name """ - def __init__(self, columns: typing.Set[str], event_name: str): + def __init__(self, columns: set[str], event_name: str): self._columns = columns self._event_name = event_name @@ -48,7 +49,7 @@ def columns(self): Returns ------- - typing.Set[str] + set[str] Event column names """ return self._columns @@ -66,7 +67,7 @@ def event_name(self): return self._event_name @abstractmethod - def parse(self, text: cudf.Series) -> cudf.Series: + def parse(self, text: SeriesType) -> SeriesType: """ Abstract method 
'parse' triggers the parsing functionality. Subclasses are required to implement and execute any parsing pre-processing steps. @@ -74,25 +75,26 @@ def parse(self, text: cudf.Series) -> cudf.Series: log.info("Begin parsing of dataframe") pass - def parse_raw_event(self, text: cudf.Series, event_regex: typing.Dict[str, any]) -> cudf.DataFrame: + def parse_raw_event(self, text: SeriesType, event_regex: dict[str, str]) -> DataFrameType: """ Processes parsing of a specific type of raw event records received as a dataframe. Parameters ---------- - text : cudf.Series + text : SeriesType Raw event log text to be parsed. - event_regex: typing.Dict[str, any] + event_regex: typing.Dict[str, str] Required regular expressions for a given event type. Returns ------- - cudf.DataFrame + DataFrameType Parsed logs dataframe """ log.debug("Parsing raw events. Event type: %s", self.event_name) - parsed_gdf = cudf.DataFrame({col: [""] for col in self.columns}) + df_pkg = get_df_pkg_from_obj(text) + parsed_gdf = df_pkg.DataFrame({col: [""] for col in self.columns}) parsed_gdf = parsed_gdf[:0] event_specific_columns = event_regex.keys() # Applies regex pattern for each expected output column to raw data @@ -109,7 +111,7 @@ def parse_raw_event(self, text: cudf.Series, event_regex: typing.Dict[str, any]) return parsed_gdf - def _load_regex_yaml(self, yaml_file) -> typing.Dict[str, any]: + def _load_regex_yaml(self, yaml_file) -> dict[str, str]: """Returns a dictionary of event regexes contained in the given yaml file.""" with open(yaml_file, encoding='UTF-8') as yaml_file_h: regex_dict = yaml.safe_load(yaml_file_h) diff --git a/python/morpheus/morpheus/parsers/ip.py b/python/morpheus/morpheus/parsers/ip.py index 1fcb75ee81..a177f49082 100644 --- a/python/morpheus/morpheus/parsers/ip.py +++ b/python/morpheus/morpheus/parsers/ip.py @@ -12,24 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import ipaddress + import numpy as np +import pandas as pd -import cudf +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type -def ip_to_int(values): +def ip_to_int(values: SeriesType) -> SeriesType: """ Convert string column of IP addresses to integer values. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - values : cudf.Series + values : SeriesType IPv4 addresses to be converted Returns ------- - rtype : cudf.Series + rtype : SeriesType Integer representations of IP addresses Examples @@ -41,22 +46,26 @@ def ip_to_int(values): 1 167772161 dtype: uint32 """ - return values.str.ip2int() + if (is_cudf_type(values)): + return values.str.ip2int() + + # Pandas does not have an ip2int method + return values.apply(lambda x: int(ipaddress.IPv4Address(x))) -def int_to_ip(values): +def int_to_ip(values: SeriesType) -> SeriesType: """ Convert integer column to IP addresses. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - values : cudf.Series + values : SeriesType uint32 representations of IP addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType IPv4 addresses Examples @@ -68,22 +77,27 @@ def int_to_ip(values): 1 10.0.0.1 dtype: object """ - return cudf.Series._from_column(values._column.int2ip()) + if (is_cudf_type(values)): + import cudf + return cudf.Series._from_column(values._column.int2ip()) + + # Pandas does not have an int2ip method + return values.apply(lambda x: str(ipaddress.IPv4Address(x))) -def is_ip(ips: str): +def is_ip(ips: SeriesType) -> SeriesType: """ Indicates whether each address is an ip string. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -95,23 +109,26 @@ def is_ip(ips: str): 1 False dtype: bool """ + if (is_cudf_type(ips)): + return ips.str.isipv4() + is_ip_regex = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" - return ips.str.match(is_ip_regex) + return ips.str.fullmatch(is_ip_regex) -def is_reserved(ips): +def is_reserved(ips: SeriesType) -> SeriesType: """ Indicates whether each address is reserved. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -129,19 +146,19 @@ def is_reserved(ips): return ips.str.match(reserved_ipv4_regex) -def is_loopback(ips): +def is_loopback(ips: SeriesType) -> SeriesType: """ Indicates whether each address is loopback. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -159,19 +176,19 @@ def is_loopback(ips): return ips.str.match(loopback_ipv4_regex) -def is_link_local(ips): +def is_link_local(ips: SeriesType) -> SeriesType: """ Indicates whether each address is link local. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -189,19 +206,19 @@ def is_link_local(ips): return ips.str.match(link_local_ipv4_regex) -def is_unspecified(ips): +def is_unspecified(ips: SeriesType) -> SeriesType: """ Indicates whether each address is unspecified. 
**Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -217,19 +234,19 @@ def is_unspecified(ips): return ips.str.match(unspecified_regex) -def is_multicast(ips): +def is_multicast(ips: SeriesType) -> SeriesType: """ Indicates whether each address is multicast. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -247,19 +264,19 @@ def is_multicast(ips): return ips.str.match(is_multicast_ipv4_regex) -def is_private(ips): +def is_private(ips: SeriesType) -> SeriesType: """ Indicates whether each address is private. **Addresses must be IPv4. IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -290,19 +307,19 @@ def is_private(ips): return ips.str.match(private_regex) -def is_global(ips): +def is_global(ips: SeriesType) -> SeriesType: """ Indicates whether each address is global. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked Returns ------- - rtype : cudf.Series + rtype : SeriesType Boolean values true or false Examples @@ -323,7 +340,7 @@ def is_global(ips): return result -def _netmask_kernel(idx, out1, out2, out3, out4, kwarg1): +def _mask_kernel(idx, out1, out2, out3, out4, kwarg1): for i, _ in enumerate(idx): out1[i] = int(kwarg1 / 16777216) % 256 out2[i] = int(kwarg1 / 65536) % 256 @@ -331,21 +348,52 @@ def _netmask_kernel(idx, out1, out2, out3, out4, kwarg1): out4[i] = int(kwarg1) % 256 -def netmask(ips, prefixlen=16): +def _mask_pandas(df_cols: tuple[int], mask_: int, series_name: str) -> pd.Series: + outputs = [int(mask_ / 16777216) % 256, int(mask_ / 65536) % 256, int(mask_ / 256) % 256, int(mask_) % 256] + return pd.Series([df_cols.idx, ".".join(map(str, outputs))], index=["idx", series_name]) + + +def _compute_mask_impl(ips: SeriesType, mask_: int, series_name: str) -> SeriesType: + df_pkg = get_df_pkg_from_obj(ips) + if is_cudf_type(ips): + df = df_pkg.DataFrame() + df["idx"] = ips.index + x = df.apply_rows( + _mask_kernel, + incols=["idx"], + outcols={ + "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 + }, + kwargs={"kwarg1": mask_}, + ) + + out1 = x["out1"].astype(str) + out2 = x["out2"].astype(str) + out3 = x["out3"].astype(str) + out4 = x["out4"].astype(str) + df[series_name] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + else: + df = df_pkg.DataFrame({"idx": ips.index}) + df = df.apply(_mask_pandas, axis=1, args=(mask_, series_name)) + + return df[series_name] + + +def netmask(ips: SeriesType, prefixlen: int = 16) -> SeriesType: """ Compute a column of netmasks for a column of IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked prefixlen: int Length of the network prefix, in bits, for IPv4 addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType Netmask ouput from set of IP address @@ -360,48 +408,24 @@ def netmask(ips, prefixlen=16): """ all_ones = (2**32) - 1 mask_int = all_ones ^ (all_ones >> prefixlen) - df = cudf.DataFrame() - df["idx"] = ips.index - x = df.apply_rows( - _netmask_kernel, - incols=["idx"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": mask_int}, - ) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["net_mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") - return df["net_mask"] - - -def _hostmask_kernel(idx, out1, out2, out3, out4, kwarg1): - for i, _ in enumerate(idx): - out1[i] = int(kwarg1 / 16777216) % 256 - out2[i] = int(kwarg1 / 65536) % 256 - out3[i] = int(kwarg1 / 256) % 256 - out4[i] = int(kwarg1) % 256 + return _compute_mask_impl(ips, mask_int, "net_mask") -def hostmask(ips, prefixlen=16): +def hostmask(ips: SeriesType, prefixlen: int = 16) -> SeriesType: """ Compute a column of hostmasks for a column of IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked prefixlen: integer Length of the network prefix, in bits, for IPv4 addresses Returns ------- - rtype : cudf.Series + rtype : SeriesType Hostmask ouput from set of IP address Examples @@ -415,24 +439,10 @@ def hostmask(ips, prefixlen=16): """ all_ones = (2**32) - 1 host_mask_int = int(all_ones ^ (all_ones >> prefixlen)) ^ all_ones - df = cudf.DataFrame() - df["idx"] = ips.index - x = df.apply_rows(_hostmask_kernel, - incols=["idx"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": host_mask_int}) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["hostmask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") - return df["hostmask"] - - -def _mask_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disable=unused-argument + return _compute_mask_impl(ips, host_mask_int, "hostmask") + + +def _mask_series_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disable=unused-argument for i, ipnum in enumerate(masked_ip_int): out1[i] = int(ipnum / 16777216) % 256 out2[i] = int(ipnum / 65536) % 256 @@ -440,21 +450,25 @@ def _mask_kernel(masked_ip_int, out1, out2, out3, out4, kwarg1): # pylint: disa out4[i] = int(ipnum) % 256 -def mask(ips, masks): +def _mask_series_pandas(df_cols: tuple[int], mask_series_name: str, output_series_name: str) -> pd.Series: + return _mask_pandas(df_cols, df_cols[mask_series_name], output_series_name) + + +def mask(ips: SeriesType, masks: SeriesType) -> SeriesType: """ Apply a mask to a column of IP addresses. **Addresses must be IPv4. 
IPv6 not yet supported.** Parameters ---------- - ips : cudf.Series + ips : SeriesType IPv4 addresses to be checked - masks: cudf.Series + masks: SeriesType The host or subnet masks to be applied Returns ------- - rtype : cudf.Series + rtype : SeriesType Masked IP address from list of IPs Examples @@ -468,21 +482,28 @@ def mask(ips, masks): 1 10.0.0.0 Name: mask, dtype: object """ - df = cudf.DataFrame() - df["int_mask"] = masks.str.ip2int() - df["int_ip"] = ips.str.ip2int() + df_pkg = get_df_pkg_from_obj(ips) + + df = df_pkg.DataFrame() + df["int_mask"] = ip_to_int(masks) + df["int_ip"] = ip_to_int(ips) df["masked_ip_int"] = df["int_mask"] & df["int_ip"] - x = df.apply_rows(_mask_kernel, - incols=["masked_ip_int"], - outcols={ - "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 - }, - kwargs={"kwarg1": 0}) - - out1 = x["out1"].astype(str) - out2 = x["out2"].astype(str) - out3 = x["out3"].astype(str) - out4 = x["out4"].astype(str) - df["mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + if (is_cudf_type(df)): + x = df.apply_rows(_mask_series_kernel, + incols=["masked_ip_int"], + outcols={ + "out1": np.int64, "out2": np.int64, "out3": np.int64, "out4": np.int64 + }, + kwargs={"kwarg1": 0}) + + out1 = x["out1"].astype(str) + out2 = x["out2"].astype(str) + out3 = x["out3"].astype(str) + out4 = x["out4"].astype(str) + df["mask"] = out1.str.cat(out2, sep=".").str.cat(out3, sep=".").str.cat(out4, sep=".") + else: + df["idx"] = ips.index + df = df.apply(_mask_series_pandas, axis=1, args=("masked_ip_int", "mask")) + return df["mask"] diff --git a/python/morpheus/morpheus/parsers/url_parser.py b/python/morpheus/morpheus/parsers/url_parser.py index bf3077a601..88a8b56ddf 100644 --- a/python/morpheus/morpheus/parsers/url_parser.py +++ b/python/morpheus/morpheus/parsers/url_parser.py @@ -13,21 +13,15 @@ # limitations under the License. 
import os - -import cudf +import types import morpheus +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj +from morpheus.utils.type_utils import is_cudf_type - -def _load_suffix_file(): - suffix_list_path = os.path.join(morpheus.DATA_DIR, "public_suffix_list.dat") - # Read suffix list csv file - suffix_df = cudf.io.csv.read_csv(suffix_list_path, names=["suffix"], header=None, dtype=["str"]) - suffix_df = suffix_df[suffix_df["suffix"].str.contains("^[^//]+$")] - return suffix_df - - -_SUFFIX_DF = _load_suffix_file() +_SUFFIX_DF_CACHE = {} _ALLOWED_OUTPUT_COLS = { "hostname", "subdomain", @@ -36,7 +30,24 @@ def _load_suffix_file(): } -def _handle_unknown_suffix(unknown_suffix_df, col_dict): +def _get_suffix_df(df_pkg: types.ModuleType) -> DataFrameType: + suffix_df = _SUFFIX_DF_CACHE.get(df_pkg) + if suffix_df is None: + suffix_list_path = os.path.join(morpheus.DATA_DIR, "public_suffix_list.dat") + # Read suffix list csv file, ignore comments and empty lines. 
+ suffix_df = df_pkg.read_csv(suffix_list_path, + names=["suffix"], + header=None, + dtype={'suffix': "str"}, + comment='/', + skip_blank_lines=True) + suffix_df = suffix_df[suffix_df["suffix"].str.contains("^[^//]+$")] + _SUFFIX_DF_CACHE[df_pkg] = suffix_df + + return suffix_df + + +def _handle_unknown_suffix(unknown_suffix_df: DataFrameType, col_dict: dict[str, bool]) -> DataFrameType: if col_dict["hostname"]: unknown_suffix_df = unknown_suffix_df[["idx", "tld0"]] unknown_suffix_df = unknown_suffix_df.rename(columns={"tld0": "hostname"}) @@ -53,7 +64,8 @@ def _handle_unknown_suffix(unknown_suffix_df, col_dict): return unknown_suffix_df -def _extract_tld(input_df, suffix_df, col_len, col_dict): +def _extract_tld(input_df: DataFrameType, suffix_df: DataFrameType, col_len: int, + col_dict: dict[str, bool]) -> DataFrameType: tmp_dfs = [] # Left join on single column dataframe does not provide expected results hence adding dummy column. suffix_df["dummy"] = "" @@ -109,12 +121,14 @@ def _extract_tld(input_df, suffix_df, col_len, col_dict): tmp_dfs.append(unknown_suffix_df) else: continue + # Concat all temporary output dataframes - output_df = cudf.concat(tmp_dfs) + df_pkg = get_df_pkg_from_obj(input_df) + output_df = df_pkg.concat(tmp_dfs) return output_df -def _create_col_dict(allowed_output_cols, req_cols): +def _create_col_dict(allowed_output_cols: set[str], req_cols: set[str]) -> dict[str, bool]: """Creates dictionary to apply check condition while extracting tld. """ col_dict = {col: True for col in allowed_output_cols} @@ -124,7 +138,7 @@ def _create_col_dict(allowed_output_cols, req_cols): return col_dict -def _verify_req_cols(req_cols, allowed_output_cols): +def _verify_req_cols(req_cols: set[str], allowed_output_cols: set[str]) -> set[str]: """Verify user requested columns against allowed output columns. 
""" if req_cols is not None: @@ -135,7 +149,7 @@ def _verify_req_cols(req_cols, allowed_output_cols): return req_cols -def _generate_tld_cols(hostname_split_df, hostnames, col_len): +def _generate_tld_cols(hostname_split_df: DataFrameType, hostnames: SeriesType, col_len: int) -> DataFrameType: hostname_split_df = hostname_split_df.fillna("") hostname_split_df["tld" + str(col_len)] = hostname_split_df[col_len] # Add all other elements of hostname_split_df @@ -147,25 +161,25 @@ def _generate_tld_cols(hostname_split_df, hostnames, col_len): return hostname_split_df -def _extract_hostnames(urls): +def _extract_hostnames(urls: SeriesType) -> SeriesType: hostnames = urls.str.extract("([\\w]+[\\.].*[^/]|[\\-\\w]+[\\.].*[^/])")[0].str.extract("([\\w\\.\\-]+)")[0] return hostnames -def parse(urls, req_cols=None): +def parse(urls: SeriesType, req_cols: set[str] = None) -> DataFrameType: """ Extract hostname, domain, subdomain and suffix from URLs. Parameters ---------- - urls : cudf.Series + urls : SeriesType URLs to be parsed. req_cols : typing.Set[str] Selected columns to extract. Can be subset of (hostname, domain, subdomain and suffix). Returns ------- - cudf.DataFrame + DataFrameType Parsed dataframe with selected columns to extract. 
Examples @@ -196,6 +210,7 @@ def parse(urls, req_cols=None): 2 github com 3 pydata org """ + df_pkg = get_df_pkg_from_obj(urls) req_cols = _verify_req_cols(req_cols, _ALLOWED_OUTPUT_COLS) col_dict = _create_col_dict(req_cols, _ALLOWED_OUTPUT_COLS) hostnames = _extract_hostnames(urls) @@ -203,14 +218,21 @@ def parse(urls, req_cols=None): del urls hostname_split_ser = hostnames.str.findall("([^.]+)") hostname_split_df = hostname_split_ser.to_frame() - hostname_split_df = cudf.DataFrame(hostname_split_df[0].to_arrow().to_pylist()) + + if is_cudf_type(hostname_split_df): + hostname_split_df = df_pkg.DataFrame(hostname_split_df[0].to_arrow().to_pylist()) + else: + hostname_split_df = df_pkg.DataFrame(hostname_split_df[0].to_list()) + col_len = len(hostname_split_df.columns) - 1 hostname_split_df = _generate_tld_cols(hostname_split_df, hostnames, col_len) # remove hostnames since they are available in hostname_split_df del hostnames # Assign input index to idx column. hostname_split_df["idx"] = url_index - output_df = _extract_tld(hostname_split_df, _SUFFIX_DF, col_len, col_dict) + + suffix_df = _get_suffix_df(df_pkg) + output_df = _extract_tld(hostname_split_df, suffix_df, col_len, col_dict) # Sort index based on given input index order. output_df = output_df.sort_values("idx", ascending=True) # Drop temp columns. 
diff --git a/python/morpheus/morpheus/parsers/windows_event_parser.py b/python/morpheus/morpheus/parsers/windows_event_parser.py index 475c4a405d..8b62c2cf0e 100644 --- a/python/morpheus/morpheus/parsers/windows_event_parser.py +++ b/python/morpheus/morpheus/parsers/windows_event_parser.py @@ -16,10 +16,11 @@ import os import typing -import cudf - import morpheus from morpheus.parsers.event_parser import EventParser +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType +from morpheus.utils.type_utils import get_df_pkg_from_obj log = logging.getLogger(__name__) @@ -41,17 +42,17 @@ def __init__(self, interested_eventcodes=None): self._event_regex = self._load_regex_yaml(regex_filepath) EventParser.__init__(self, self.get_columns(), self.EVENT_NAME) - def parse(self, text: cudf.Series) -> cudf.Series: + def parse(self, text: SeriesType) -> DataFrameType: """Parses the Windows raw event. Parameters ---------- - text : cudf.Series + text : SeriesType Raw event log text to be parsed Returns ------- - cudf.DataFrame + DataFrameType Parsed logs dataframe """ # Clean raw data to be consistent. @@ -65,23 +66,25 @@ def parse(self, text: cudf.Series) -> cudf.Series: temp = self.parse_raw_event(input_chunk, self._event_regex[eventcode]) if not temp.empty: output_chunks.append(temp) - parsed_dataframe = cudf.concat(output_chunks) + + df_pkg = get_df_pkg_from_obj(text) + parsed_dataframe = df_pkg.concat(output_chunks) # Replace null values with empty. parsed_dataframe = parsed_dataframe.fillna("") return parsed_dataframe - def clean_raw_data(self, text: cudf.Series) -> cudf.Series: + def clean_raw_data(self, text: SeriesType) -> SeriesType: """ Lower casing and replacing escape characters. 
Parameters ---------- - text : cudf.Series + text : SeriesType Raw event log text to be clean Returns ------- - cudf.Series + SeriesType Clean raw event log text """ text = (text.str.lower().str.replace("\\\\t", "").str.replace("\\\\r", "").str.replace("\\\\n", "|")) diff --git a/python/morpheus/morpheus/parsers/zeek.py b/python/morpheus/morpheus/parsers/zeek.py index 44ef464e1f..bc8d5683b7 100644 --- a/python/morpheus/morpheus/parsers/zeek.py +++ b/python/morpheus/morpheus/parsers/zeek.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import cudf +from morpheus.io.utils import get_csv_reader +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_aliases import DataFrameType TYPE_DICT = { "bool": "bool", @@ -36,7 +38,7 @@ } -def parse(filepath: str) -> cudf.DataFrame: +def parse(filepath: str, df_type: DataFrameModule = "cudf") -> DataFrameType: """ Parse Zeek log file and return cuDF dataframe. Uses header comments to get column names/types and configure parser. @@ -45,20 +47,23 @@ def parse(filepath: str) -> cudf.DataFrame: ---------- filepath : str File path of Zeek log file + df_type : DataFrameTypeStr, default 'cudf' + Type of dataframe to return. 
Either 'cudf' or 'pandas' Returns ------- - cudf.DataFrame + DataFrameType Parsed Zeek log dataframe """ - header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8) + csv_reader = get_csv_reader(df_type) + header_gdf = csv_reader(filepath, names=["line"], nrows=8) lines_gdf = header_gdf["line"].str.split() column_names = lines_gdf.iloc[6][1:] column_types = lines_gdf.iloc[7][1:] column_dtypes = list(map(lambda x: TYPE_DICT.get(x, "str"), column_types)) - log_gdf = cudf.read_csv( + log_gdf = csv_reader( filepath, delimiter="\t", dtype=column_dtypes, diff --git a/python/morpheus/morpheus/pipeline/execution_mode_mixins.py b/python/morpheus/morpheus/pipeline/execution_mode_mixins.py new file mode 100644 index 0000000000..e5483501fa --- /dev/null +++ b/python/morpheus/morpheus/pipeline/execution_mode_mixins.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Mixins to indicate which execution modes are supported for a given stage. These mixins should be used for any stage +that needs to support execution modes other than the default GPU mode, and the supported execution modes do not change +based upon configuration or runtime conditions. 
+""" + +import types +from abc import ABC + +from morpheus.config import ExecutionMode +from morpheus.utils import type_utils +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_aliases import DataFrameType + + +class CpuOnlyMixin(ABC): + """ + Mixin intented to be added to stages which support only CPU execution modes. + """ + + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return (ExecutionMode.CPU, ) + + +class GpuAndCpuMixin(ABC): + """ + Mixin intented to be added to stages which support both GPU and CPU execution modes. + """ + + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + + @property + def df_type_str(self) -> DataFrameModule: + """ + Returns the DataFrame module that should be used for the given execution mode. + """ + return type_utils.exec_mode_to_df_type_str(self._config.execution_mode) + + def get_df_pkg(self) -> types.ModuleType: + """ + Returns the DataFrame package that should be used for the given execution mode. + """ + return type_utils.get_df_pkg(self._config.execution_mode) + + def get_df_class(self) -> type[DataFrameType]: + """ + Returns the DataFrame class that should be used for the given execution mode. 
+ """ + return type_utils.get_df_class(self._config.execution_mode) diff --git a/python/morpheus/morpheus/pipeline/linear_pipeline.py b/python/morpheus/morpheus/pipeline/linear_pipeline.py index 7b8a4a767c..add998fa4c 100644 --- a/python/morpheus/morpheus/pipeline/linear_pipeline.py +++ b/python/morpheus/morpheus/pipeline/linear_pipeline.py @@ -148,6 +148,7 @@ def add_segment_boundary(self, data_type=None, as_shared_pointer=False): raise RuntimeError("Cannot create a segment boundary, current segment is empty.") empty_config = Config() + empty_config.execution_mode = self._execution_mode boundary_egress = LinearBoundaryEgressStage(empty_config, boundary_port_id=self._current_segment_id, data_type=data_type) diff --git a/python/morpheus/morpheus/pipeline/pipeline.py b/python/morpheus/morpheus/pipeline/pipeline.py index 6f719e4d54..1298f6f020 100644 --- a/python/morpheus/morpheus/pipeline/pipeline.py +++ b/python/morpheus/morpheus/pipeline/pipeline.py @@ -60,6 +60,7 @@ class Pipeline(): """ def __init__(self, config: Config): + config.freeze() self._mutex = threading.RLock() @@ -91,6 +92,8 @@ def __init__(self, config: Config): # Future that allows post_start to propagate exceptions back to pipeline self._post_start_future: asyncio.Future = None + self._execution_mode = config.execution_mode + @property def state(self) -> PipelineState: return self._state diff --git a/python/morpheus/morpheus/pipeline/preallocator_mixin.py b/python/morpheus/morpheus/pipeline/preallocator_mixin.py index e50b4e2070..a06f7e1532 100644 --- a/python/morpheus/morpheus/pipeline/preallocator_mixin.py +++ b/python/morpheus/morpheus/pipeline/preallocator_mixin.py @@ -17,18 +17,16 @@ from abc import ABC from collections import OrderedDict -import cupy as cp import mrc import numpy as np import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.common import TypeId from morpheus.common import typeid_is_fully_supported from morpheus.common import typeid_to_numpy_str 
from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.utils.type_aliases import DataFrameType @@ -40,7 +38,7 @@ class PreallocatorMixin(ABC): """ Mixin intented to be added to stages, typically source stages, which are emitting newly constructed DataFrame or - MessageMeta instances into the segment. During segment build, if the `_needed_columns` addtribut is not empty an + MessageMeta instances into the segment. During segment build, if the `_needed_columns` addtribute is not empty an additional node will be inserted into the graph after the derived class' node which will perform the allocation. The exceptions would be non-source stages like DFP's `DFPFileToDataFrameStage` which are not sources but are @@ -59,7 +57,9 @@ def set_needed_columns(self, needed_columns: OrderedDict): def _preallocate_df(self, df: DataFrameType) -> DataFrameType: missing_columns = [col for col in self._needed_columns.keys() if col not in df.columns] if len(missing_columns) > 0: - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): + # assume cudf.DataFrame + import cupy as cp alloc_func = cp.zeros else: alloc_func = np.zeros @@ -118,12 +118,19 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) node = builder.make_node(node_name, ops.map(self._preallocate_meta)) else: raise RuntimeError(f"Unsupported output type {pretty_type}") - elif issubclass(out_type, (cudf.DataFrame, pd.DataFrame)): - node = builder.make_node(node_name, ops.map(self._preallocate_df)) + else: - msg = ("Additional columns were requested to be inserted into the Dataframe, but the output type " - f"{pretty_type} isn't a supported type") - raise RuntimeError(msg) + supported_df_types = [pd.DataFrame] + if self._config.execution_mode == ExecutionMode.GPU: + import cudf + supported_df_types.append(cudf.DataFrame) + + if issubclass(out_type, 
tuple(supported_df_types)): + node = builder.make_node(node_name, ops.map(self._preallocate_df)) + else: + msg = ("Additional columns were requested to be inserted into the Dataframe, but the output type " + f"{pretty_type} isn't a supported type") + raise RuntimeError(msg) builder.make_edge(out_node, node) out_node = node diff --git a/python/morpheus/morpheus/pipeline/single_port_stage.py b/python/morpheus/morpheus/pipeline/single_port_stage.py index b9ea20aeeb..49687afc96 100644 --- a/python/morpheus/morpheus/pipeline/single_port_stage.py +++ b/python/morpheus/morpheus/pipeline/single_port_stage.py @@ -83,7 +83,6 @@ def _build(self, builder: mrc.Builder, input_nodes: list[mrc.SegmentObject]) -> def _post_build_single(self, _: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject: return out_node - @typing.final def _post_build(self, builder: mrc.Builder, out_ports_nodes: list[mrc.SegmentObject]) -> list[mrc.SegmentObject]: ret_val = self._post_build_single(builder, out_ports_nodes[0]) diff --git a/python/morpheus/morpheus/pipeline/stage_base.py b/python/morpheus/morpheus/pipeline/stage_base.py index ebae6cbfef..ebb5541166 100644 --- a/python/morpheus/morpheus/pipeline/stage_base.py +++ b/python/morpheus/morpheus/pipeline/stage_base.py @@ -27,6 +27,7 @@ import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.config import Config from morpheus.config import CppConfig +from morpheus.config import ExecutionMode from morpheus.utils.atomic_integer import AtomicInteger from morpheus.utils.type_utils import _DecoratorType @@ -84,6 +85,7 @@ class StageBase(ABC, collections.abc.Hashable): def __init__(self, config: Config): # Save the config + config.freeze() self._config = config self._id = StageBase.__ID_COUNTER.get_and_inc() @@ -285,6 +287,19 @@ def supports_cpp_node(self) -> bool: # return False pass + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. 
By default this returns `(ExecutionMode.GPU,)`. + Subclasses can override this method to specify different execution modes. + + For most stages the values will be static, and this can be accomplished by making use of either the + `CpuOnlyMixin` or `GpuAndCpuMixin` mixins. + + However, complex stages may choose to make this decision at runtime, in which case this method should be + overridden. directly within the stage class. + """ + return (ExecutionMode.GPU, ) + def _build_cpp_node(self): """ Specifies whether to build a C++ node. Only should be called during the build phase. @@ -347,6 +362,14 @@ def can_build(self, check_ports=False) -> bool: def _pre_build(self, do_propagate: bool = True): assert not self.is_built, "build called prior to _pre_build" assert not self.is_pre_built, "Can only pre-build stages once!" + + # Check the execution mode + if (self._config.execution_mode not in self.supported_execution_modes()): + supported_modes = ", ".join(str(x) for x in self.supported_execution_modes()) + raise RuntimeError(f"Unsupported execution mode {self._config.execution_mode} for stage {self.name}, " + f"supported exexution modes are {supported_modes}") + + # Perform schema validation schema = _pipeline.StageSchema(self) self._pre_compute_schema(schema) self.compute_schema(schema) diff --git a/python/morpheus/morpheus/pipeline/stage_decorator.py b/python/morpheus/morpheus/pipeline/stage_decorator.py index bede41b3e2..fb2412a325 100644 --- a/python/morpheus/morpheus/pipeline/stage_decorator.py +++ b/python/morpheus/morpheus/pipeline/stage_decorator.py @@ -22,11 +22,10 @@ import pandas as pd from mrc.core import operators as ops -import cudf - import morpheus.pipeline as _pipeline # pylint: disable=cyclic-import from morpheus.common import TypeId from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import MessageMeta logger = logging.getLogger(__name__) @@ -123,7 +122,13 @@ class 
WrappedFunctionSourceStage(_pipeline.SingleOutputSource): Function to use for computing the schema of the stage. """ - def __init__(self, config: Config, *, name: str, gen_fn: GeneratorType, compute_schema_fn: ComputeSchemaType): + def __init__(self, + config: Config, + *, + name: str, + gen_fn: GeneratorType, + compute_schema_fn: ComputeSchemaType, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, )): super().__init__(config) # collections.abc.Generator is a subclass of collections.abc.Iterator if not inspect.isgeneratorfunction(gen_fn): @@ -132,6 +137,7 @@ def __init__(self, config: Config, *, name: str, gen_fn: GeneratorType, compute_ self._name = name self._gen_fn = gen_fn self._compute_schema_fn = compute_schema_fn + self._supported_execution_modes = execution_modes @property def name(self) -> str: @@ -143,6 +149,12 @@ def supports_cpp_node(self) -> bool: def compute_schema(self, schema: _pipeline.StageSchema): self._compute_schema_fn(schema) + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return self._supported_execution_modes + def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: return builder.make_source(self.unique_name, self._gen_fn) @@ -172,7 +184,8 @@ def source( gen_fn: GeneratorType = None, *, name: str = None, - compute_schema_fn: ComputeSchemaType = None + compute_schema_fn: ComputeSchemaType = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, ) ) -> typing.Callable[typing.Concatenate[Config, _P], WrappedFunctionSourceStage]: """ Decorator for wrapping a function as a source stage. 
The function must be a generator method, and provide a @@ -196,7 +209,10 @@ def source( >>> pipe.set_source(source_gen(config, dataframes=[df])) """ if gen_fn is None: - return functools.partial(source, name=name, compute_schema_fn=compute_schema_fn) + return functools.partial(source, + name=name, + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) # Use wraps to ensure user's don't lose their function name and docstrinsgs, however we do want to override the # annotations to reflect that the returned function requires a config and returns a stage @@ -236,18 +252,25 @@ def compute_schema_fn_inner(schema: _pipeline.StageSchema): bound_gen_fn = functools.partial(gen_fn, **kwargs) + pre_allocation_output_types = [pd.DataFrame, MessageMeta] + if config.execution_mode == ExecutionMode.GPU: + import cudf + pre_allocation_output_types.append(cudf.DataFrame) + # If the return type supports pre-allocation we use the pre-allocating source - if return_type in (pd.DataFrame, cudf.DataFrame, MessageMeta): + if return_type in pre_allocation_output_types: return PreAllocatedWrappedFunctionStage(config=config, name=name, gen_fn=bound_gen_fn, - compute_schema_fn=compute_schema_fn) + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) return WrappedFunctionSourceStage(config=config, name=name, gen_fn=bound_gen_fn, - compute_schema_fn=compute_schema_fn) + compute_schema_fn=compute_schema_fn, + execution_modes=execution_modes) return wrapper @@ -276,16 +299,15 @@ class WrappedFunctionStage(_pipeline.SinglePortStage): by the `PreAllocatedWrappedFunctionStage` to ensure the DataFrame has the needed columns allocated. 
""" - def __init__( - self, - config: Config, - *, - name: str = None, - on_data_fn: typing.Callable, - accept_type: type, - compute_schema_fn: ComputeSchemaType, - needed_columns: dict[str, TypeId] = None, - ): + def __init__(self, + config: Config, + *, + name: str = None, + on_data_fn: typing.Callable, + accept_type: type, + compute_schema_fn: ComputeSchemaType, + needed_columns: dict[str, TypeId] = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, )): super().__init__(config) self._name = name self._on_data_fn = on_data_fn @@ -295,6 +317,8 @@ def __init__( if needed_columns is not None: self._needed_columns.update(needed_columns) + self._supported_execution_modes = execution_modes + @property def name(self) -> str: return self._name @@ -308,6 +332,12 @@ def supports_cpp_node(self) -> bool: def compute_schema(self, schema: _pipeline.StageSchema): self._compute_schema_fn(schema) + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. 
+ """ + return self._supported_execution_modes + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: node = builder.make_node(self.unique_name, ops.map(self._on_data_fn)) builder.make_edge(input_node, node) @@ -318,12 +348,15 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> DecoratedStageType = typing.Callable[typing.Concatenate[Config, _P], WrappedFunctionStage] -def stage(on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] = None, - *, - name: str = None, - accept_type: type = None, - compute_schema_fn: ComputeSchemaType = None, - needed_columns: dict[str, TypeId] = None) -> DecoratedStageType: +def stage( + on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] = None, + *, + name: str = None, + accept_type: type = None, + compute_schema_fn: ComputeSchemaType = None, + needed_columns: dict[str, TypeId] = None, + execution_modes: tuple[ExecutionMode] = (ExecutionMode.GPU, ) +) -> DecoratedStageType: """ Decorator for wrapping a function as a stage. The function must receive at least one argument, the first argument must be the incoming message, and must return a value. 
@@ -359,7 +392,8 @@ def stage(on_data_fn: typing.Callable[typing.Concatenate[_InputT, _P], _OutputT] name=name, accept_type=accept_type, compute_schema_fn=compute_schema_fn, - needed_columns=needed_columns) + needed_columns=needed_columns, + execution_modes=execution_modes) # Use wraps to ensure user's don't lose their function name and docstrinsgs, however we do want to override the # annotations to reflect that the returned function requires a config and returns a stage @@ -410,6 +444,7 @@ def compute_schema_fn_inner(schema: _pipeline.StageSchema): on_data_fn=bound_on_data_fn, accept_type=accept_type, compute_schema_fn=compute_schema_fn, - needed_columns=needed_columns) + needed_columns=needed_columns, + execution_modes=execution_modes) return wrapper diff --git a/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py b/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py index ad8db9ebc2..c1e42169c2 100644 --- a/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py +++ b/python/morpheus/morpheus/stages/boundary/linear_boundary_stage.py @@ -20,6 +20,7 @@ from morpheus.config import Config from morpheus.pipeline.boundary_stage_mixin import BoundaryStageMixin +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource @@ -29,7 +30,7 @@ logger = logging.getLogger(__name__) -class LinearBoundaryEgressStage(BoundaryStageMixin, PassThruTypeMixin, SinglePortStage): +class LinearBoundaryEgressStage(BoundaryStageMixin, PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ The LinearBoundaryEgressStage acts as an egress point from one linear segment to another. 
Given an existing linear pipeline that we want to connect to another segment, a linear boundary egress stage would be added, in conjunction @@ -82,7 +83,7 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> return input_node -class LinearBoundaryIngressStage(BoundaryStageMixin, PreallocatorMixin, SingleOutputSource): +class LinearBoundaryIngressStage(BoundaryStageMixin, PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ The LinearBoundaryIngressStage acts as source ingress point from a corresponding egress in another linear segment. Given an existing linear pipeline that we want to connect to another segment, a linear boundary egress stage would diff --git a/python/morpheus/morpheus/stages/general/monitor_stage.py b/python/morpheus/morpheus/stages/general/monitor_stage.py index cc3a96f33f..821fe729bd 100644 --- a/python/morpheus/morpheus/stages/general/monitor_stage.py +++ b/python/morpheus/morpheus/stages/general/monitor_stage.py @@ -22,6 +22,7 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.monitor_controller import MonitorController +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.logger import LogLevels @@ -30,7 +31,7 @@ @register_stage("monitor", ignore_args=["determine_count_fn"]) -class MonitorStage(PassThruTypeMixin, SinglePortStage): +class MonitorStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Display throughput numbers at a specific point in the pipeline. 
diff --git a/python/morpheus/morpheus/stages/general/multi_processing_stage.py b/python/morpheus/morpheus/stages/general/multi_processing_stage.py index 8011ae7591..fbe60f410a 100644 --- a/python/morpheus/morpheus/stages/general/multi_processing_stage.py +++ b/python/morpheus/morpheus/stages/general/multi_processing_stage.py @@ -22,6 +22,7 @@ import mrc.core.operators as ops from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.shared_process_pool import SharedProcessPool @@ -229,6 +230,12 @@ def name(self) -> str: """Return the name of the stage.""" return self._name + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + def _on_data(self, data: InputT) -> OutputT: task = self._shared_process_pool.submit_task(self.name, self._process_fn, data) result = task.result() diff --git a/python/morpheus/morpheus/stages/general/trigger_stage.py b/python/morpheus/morpheus/stages/general/trigger_stage.py index b8b754d910..3164a84b64 100644 --- a/python/morpheus/morpheus/stages/general/trigger_stage.py +++ b/python/morpheus/morpheus/stages/general/trigger_stage.py @@ -19,6 +19,7 @@ from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -26,7 +27,7 @@ @register_stage("trigger") -class TriggerStage(PassThruTypeMixin, SinglePortStage): +class TriggerStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Buffer data until the previous stage has completed. 
diff --git a/python/morpheus/morpheus/stages/inference/inference_stage.py b/python/morpheus/morpheus/stages/inference/inference_stage.py index d5c58937aa..4219064432 100644 --- a/python/morpheus/morpheus/stages/inference/inference_stage.py +++ b/python/morpheus/morpheus/stages/inference/inference_stage.py @@ -14,14 +14,12 @@ import logging import typing -from abc import abstractmethod from functools import partial import cupy as cp import mrc from mrc.core import operators as ops -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages.memory.tensor_memory import TensorMemory @@ -82,15 +80,14 @@ def build_output_message(self, msg: ControlMessage) -> ControlMessage: dims = self.calc_output_dims(msg) output_dims = (msg.payload().count, *dims[1:]) - memory = _messages.TensorMemory(count=output_dims[0], tensors={'probs': cp.zeros(output_dims)}) + memory = TensorMemory(count=output_dims[0], tensors={'probs': cp.zeros(output_dims)}) output_message = ControlMessage(msg) output_message.payload(msg.payload()) output_message.tensors(memory) return output_message - @abstractmethod - def calc_output_dims(self, msg: ControlMessage) -> typing.Tuple: + def calc_output_dims(self, msg: ControlMessage) -> tuple: """ Calculates the dimensions of the inference output message data given an input message. @@ -101,12 +98,11 @@ def calc_output_dims(self, msg: ControlMessage) -> typing.Tuple: Returns ------- - typing.Tuple + tuple Output dimensions of response. """ - pass + raise NotImplementedError("No Python implementation provided by this stage") - @abstractmethod def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory], None]): """ Main inference processing function. This function will be called once for each mini-batch. 
Once the inference is @@ -121,7 +117,7 @@ def process(self, batch: ControlMessage, callback: typing.Callable[[TensorMemory Callback to set the values for the inference response. """ - pass + raise NotImplementedError("No Python implementation provided by this stage") class InferenceStage(ControlMessageStage): @@ -152,15 +148,21 @@ class InferenceStage(ControlMessageStage): ---------- c : `morpheus.config.Config` Pipeline configuration instance. - + thread_count : int, optional + Number of threads to use for inference. If not provided, the `num_threads` attribute of the `Config` object + will be used. """ - def __init__(self, c: Config): + def __init__(self, c: Config, thread_count: int = None): super().__init__(c) + # GPU only stage, assuming all messages are cuDF/CuPy based + import cudf + self._cudf = cudf + self._fea_length = c.feature_length - self._thread_count = c.num_threads + self._thread_count = thread_count or c.num_threads self._workers: typing.List[InferenceWorker] = [] self._inf_queue = ProducerConsumerQueue() @@ -173,13 +175,13 @@ def __init__(self, c: Config): def name(self) -> str: return "inference" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Accepted input types to this stage. Returns ------- - typing.Tuple + tuple Tuple of input types. 
""" return (ControlMessage, ) @@ -187,11 +189,10 @@ def accepted_types(self) -> typing.Tuple: def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(ControlMessage) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: # Default to False unless derived classes override this value return False - @abstractmethod def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWorker: """ Returns the main inference worker which manages requests possibly in another thread depending on which mode the @@ -209,7 +210,7 @@ def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> InferenceWo `InferenceWorker` Inference worker implementation for stage. """ - pass + raise NotImplementedError("No Python implementation provided by this stage") def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: raise NotImplementedError("No C++ node is available for this inference type") @@ -327,7 +328,7 @@ def _split_batches(msg: ControlMessage, max_batch_size: int) -> typing.List[Cont out_msg.payload(msg.payload().get_slice(start, stop)) - out_msg_tensors = _messages.TensorMemory(count=stop - start, tensors={}) + out_msg_tensors = TensorMemory(count=stop - start, tensors={}) for (name, tensor) in msg.tensors().get_tensors().items(): out_msg_tensors.set_tensor(name, tensor[start:stop]) out_msg.tensors(out_msg_tensors) diff --git a/python/morpheus/morpheus/stages/inference/triton_inference_stage.py b/python/morpheus/morpheus/stages/inference/triton_inference_stage.py index a90fe6a983..62f0a51d8e 100644 --- a/python/morpheus/morpheus/stages/inference/triton_inference_stage.py +++ b/python/morpheus/morpheus/stages/inference/triton_inference_stage.py @@ -28,7 +28,6 @@ from tritonclient.utils import InferenceServerException from tritonclient.utils import triton_to_np_dtype -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from 
morpheus.config import PipelineModes @@ -685,6 +684,9 @@ class TritonInferenceStage(InferenceStage): which will be inroduced as: inout_mapping={"mask": "input_mask", "output": "probs"} + thread_count : int, optional + Number of threads to use for inference. If not provided, the `num_threads` attribute of the `Config` object + will be used. """ _INFERENCE_WORKER_DEFAULT_INOUT_MAPPING = { @@ -711,8 +713,9 @@ def __init__(self, needs_logits: bool = None, inout_mapping: dict[str, str] = None, input_mapping: dict[str, str] = None, - output_mapping: dict[str, str] = None): - super().__init__(c) + output_mapping: dict[str, str] = None, + thread_count: int = None): + super().__init__(c, thread_count=thread_count) self._config = c @@ -781,6 +784,7 @@ def _get_inference_worker(self, inf_queue: ProducerConsumerQueue) -> TritonInfer needs_logits=self._needs_logits) def _get_cpp_inference_node(self, builder: mrc.Builder) -> mrc.SegmentObject: + import morpheus._lib.stages as _stages return _stages.InferenceClientStage(builder, self.unique_name, self._server_url, diff --git a/python/morpheus/morpheus/stages/input/appshield_source_stage.py b/python/morpheus/morpheus/stages/input/appshield_source_stage.py index acd22a54fa..e1e76b4023 100644 --- a/python/morpheus/morpheus/stages/input/appshield_source_stage.py +++ b/python/morpheus/morpheus/stages/input/appshield_source_stage.py @@ -16,7 +16,6 @@ import json import logging import re -import typing from functools import partial from json.decoder import JSONDecodeError @@ -27,8 +26,10 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline import SingleOutputSource +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import 
PreallocatorMixin from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.directory_watcher import DirectoryWatcher @@ -37,7 +38,7 @@ @register_stage("from-appshield", modes=[PipelineModes.FIL]) -class AppShieldSourceStage(PreallocatorMixin, SingleOutputSource): +class AppShieldSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Source stage is used to load Appshield messages from one or more plugins into a dataframe. It normalizes nested json messages and arranges them into a dataframe by snapshot @@ -77,9 +78,9 @@ class AppShieldSourceStage(PreallocatorMixin, SingleOutputSource): def __init__(self, c: Config, input_glob: str, - plugins_include: typing.List[str], - cols_include: typing.List[str], - cols_exclude: typing.List[str] = None, + plugins_include: list[str], + cols_include: list[str], + cols_exclude: list[str] = None, watch_directory: bool = False, max_files: int = -1, sort_glob: bool = False, @@ -102,6 +103,9 @@ def __init__(self, self._input_count = None + import cudf + self._cudf = cudf + self._watcher = DirectoryWatcher(input_glob=input_glob, watch_directory=watch_directory, max_files=max_files, @@ -124,10 +128,10 @@ def supports_cpp_node(self): return False def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(AppShieldMessageMeta) + schema.output_schema.set_type(ControlMessage) @staticmethod - def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str]): + def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: list[str]): """ Fill missing interested plugin columns. @@ -135,7 +139,7 @@ def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str] ---------- plugin_df : pandas.DataFrame Snapshot plugin dataframe - cols_include : typing.List[str] + cols_include : list[str] Columns that needs to be included. 
Returns @@ -152,7 +156,7 @@ def fill_interested_cols(plugin_df: pd.DataFrame, cols_include: typing.List[str] return plugin_df @staticmethod - def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): + def read_file_to_df(file: io.TextIOWrapper, cols_exclude: list[str]): """ Read file content to dataframe. @@ -160,7 +164,7 @@ def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): ---------- file : `io.TextIOWrapper` Input file object - cols_exclude : typing.List[str] + cols_exclude : list[str] Dropping columns from a dataframe. Returns @@ -185,7 +189,7 @@ def read_file_to_df(file: io.TextIOWrapper, cols_exclude: typing.List[str]): return plugin_df @staticmethod - def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd.DataFrame: + def load_df(filepath: str, cols_exclude: list[str], encoding: str) -> pd.DataFrame: """ Reads a file into a dataframe. @@ -193,7 +197,7 @@ def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd. ---------- filepath : str Path to a file. - cols_exclude : typing.List[str] + cols_exclude : list[str] Columns that needs to exclude. encoding : str Encoding to read a file. @@ -228,13 +232,13 @@ def load_df(filepath: str, cols_exclude: typing.List[str], encoding: str) -> pd. return plugin_df @staticmethod - def load_meta_cols(filepath_split: typing.List[str], plugin: str, plugin_df: pd.DataFrame) -> pd.DataFrame: + def load_meta_cols(filepath_split: list[str], plugin: str, plugin_df: pd.DataFrame) -> pd.DataFrame: """ Loads meta columns to dataframe. Parameters ---------- - filepath_split : typing.List[str] + filepath_split : list[str] Splits of file path. plugin : str Plugin name to which the data belongs to. @@ -268,20 +272,20 @@ def load_meta_cols(filepath_split: typing.List[str], plugin: str, plugin_df: pd. 
return plugin_df @staticmethod - def batch_source_split(x: typing.List[pd.DataFrame], source: str) -> typing.Dict[str, pd.DataFrame]: + def batch_source_split(x: list[pd.DataFrame], source: str) -> dict[str, pd.DataFrame]: """ Combines plugin dataframes from multiple snapshot and split dataframe per source. Parameters ---------- - x : typing.List[pd.DataFrame] + x : list[pd.DataFrame] Dataframes from multiple sources. source : str source column name to group it. Returns ------- - typing.Dict[str, pandas.DataFrame] + dict[str, pandas.DataFrame] Grouped dataframes by source. """ @@ -301,30 +305,30 @@ def batch_source_split(x: typing.List[pd.DataFrame], source: str) -> typing.Dict return source_dfs @staticmethod - def files_to_dfs(x: typing.List[str], - cols_include: typing.List[str], - cols_exclude: typing.List[str], - plugins_include: typing.List[str], - encoding: str) -> typing.Dict[str, pd.DataFrame]: + def files_to_dfs(x: list[str], + cols_include: list[str], + cols_exclude: list[str], + plugins_include: list[str], + encoding: str) -> dict[str, pd.DataFrame]: """ Load plugin files into a dataframe, then segment the dataframe by source. Parameters ---------- - x : typing.List[str] + x : list[str] List of file paths. - cols_include : typing.List[str] + cols_include : list[str] Columns that needs to include. - cols_exclude : typing.List[str] + cols_exclude : list[str] Columns that needs to exclude. - plugins_include: typing.List[str] + plugins_include: list[str] For each path in `x`, a list of plugins to load additional meta cols from. encoding : str Encoding to read a file. Returns ------- - typing.Dict[str, pandas.DataFrame] + dict[str, pandas.DataFrame] Grouped dataframes by source. 
""" # Using pandas to parse nested JSON until cuDF adds support @@ -348,18 +352,19 @@ def files_to_dfs(x: typing.List[str], return df_per_source - @staticmethod - def _build_metadata(x: typing.Dict[str, pd.DataFrame]): + def _build_messages(self, source_dfs: dict[str, pd.DataFrame]): - metas = [] + output_messages = [] - for source, df in x.items(): + for source, df in source_dfs.items(): - # Now make a AppShieldMessageMeta with the source name - meta = AppShieldMessageMeta(df, source) - metas.append(meta) + # Now make a message with the source name + cm = ControlMessage() + cm.payload(MessageMeta(self._cudf.DataFrame(df))) + cm.set_metadata("source", source) + output_messages.append(cm) - return metas + return output_messages def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: # The first source just produces filenames @@ -376,8 +381,8 @@ def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) cols_exclude=self._cols_exclude, plugins_include=self._plugins_include, encoding=self._encoding)), - ops.map(self._build_metadata), - # Finally flatten to single meta + ops.map(self._build_messages), + # Emit each message individually ops.flatten()) builder.make_edge(out_node, post_node) diff --git a/python/morpheus/morpheus/stages/input/arxiv_source.py b/python/morpheus/morpheus/stages/input/arxiv_source.py index b995d3c6b8..34fb3582b3 100644 --- a/python/morpheus/morpheus/stages/input/arxiv_source.py +++ b/python/morpheus/morpheus/stages/input/arxiv_source.py @@ -20,11 +20,11 @@ import mrc.core.operators as ops import pandas as pd -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from 
morpheus.pipeline.stage_schema import StageSchema @@ -41,7 +41,7 @@ @register_stage("from-arxiv") -class ArxivSource(PreallocatorMixin, SingleOutputSource): +class ArxivSource(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that downloads PDFs from arxiv and converts them to dataframes. @@ -98,6 +98,10 @@ def __init__(self, self._total_chunks = 0 self._cache_dir = cache_dir + if c.execution_mode == ExecutionMode.GPU: + import cudf + self._cudf = cudf + @property def name(self) -> str: """Return the name of the stage""" @@ -195,4 +199,7 @@ def _splitting_pages(self, documents: list["Document"]): df.rename(columns=map_cols, inplace=True) - return MessageMeta(cudf.from_pandas(df)) + if self._config.execution_mode == ExecutionMode.GPU: + df = self._cudf.from_pandas(df) + + return MessageMeta(df) diff --git a/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py b/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py index 2c3c400c96..06f0d47b47 100644 --- a/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py +++ b/python/morpheus/morpheus/stages/input/databricks_deltalake_source_stage.py @@ -16,11 +16,11 @@ import mrc -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages.message_meta import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -39,7 +39,7 @@ @register_stage("from-databricks-deltalake") -class DataBricksDeltaLakeSourceStage(PreallocatorMixin, SingleOutputSource): +class DataBricksDeltaLakeSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage used to load messages from a DeltaLake table. 
@@ -77,6 +77,10 @@ def __init__(self, self.items_per_page = items_per_page self.offset = 0 + if config.execution_mode == ExecutionMode.GPU: + import cudf + self._cudf = cudf + @property def name(self) -> str: return "from-databricks-deltalake" @@ -104,7 +108,14 @@ def source_generator(self, subscription: mrc.Subscription): str(self.offset), str(self.offset + self.items_per_page + 1)) self.offset += self.items_per_page + 1 - yield MessageMeta(df=cudf.from_pandas(df.toPandas().drop(["_id"], axis=1))) + + df = df.toPandas().drop(["_id"], axis=1) + + if self._config.execution_mode == ExecutionMode.GPU: + df = self._cudf.from_pandas(df) + + yield MessageMeta(df=df) + except Exception as e: logger.error( "Error occurred while reading data from \ diff --git a/python/morpheus/morpheus/stages/input/file_source_stage.py b/python/morpheus/morpheus/stages/input/file_source_stage.py index 675bc5e94b..398c1b126e 100644 --- a/python/morpheus/morpheus/stages/input/file_source_stage.py +++ b/python/morpheus/morpheus/stages/input/file_source_stage.py @@ -19,14 +19,13 @@ import mrc -# pylint: disable=morpheus-incorrect-lib-from-import -from morpheus._lib.messages import MessageMeta as CppMessageMeta from morpheus.cli import register_stage from morpheus.common import FileTypes from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.io.deserializers import read_file_to_df from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -35,7 +34,7 @@ @register_stage("from-file", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class FileSourceStage(PreallocatorMixin, SingleOutputSource): +class FileSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Load messages from a file. 
@@ -140,17 +139,14 @@ def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterable[Me filter_nulls=self._filter_null, filter_null_columns=self._filter_null_columns, parser_kwargs=self._parser_kwargs, - df_type="cudf", + df_type=self.df_type_str, ) for i in range(self._repeat_count): if not subscription.is_subscribed(): break - if (self._build_cpp_node()): - x = CppMessageMeta(df) - else: - x = MessageMeta(df) + x = MessageMeta(df) # If we are looping, copy the object. Do this before we push the object in case it changes if (i + 1 < self._repeat_count): diff --git a/python/morpheus/morpheus/stages/input/http_client_source_stage.py b/python/morpheus/morpheus/stages/input/http_client_source_stage.py index 73e9460627..cc49912467 100644 --- a/python/morpheus/morpheus/stages/input/http_client_source_stage.py +++ b/python/morpheus/morpheus/stages/input/http_client_source_stage.py @@ -21,21 +21,22 @@ import mrc import requests -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils import http_utils +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @register_stage("from-http-client", ignore_args=["query_params", "headers", "**request_kwargs"]) -class HttpClientSourceStage(PreallocatorMixin, SingleOutputSource): +class HttpClientSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that polls a remote HTTP server for incoming data. 
@@ -82,7 +83,8 @@ class HttpClientSourceStage(PreallocatorMixin, SingleOutputSource): Stops ingesting after emitting `stop_after` records (rows in the dataframe). Useful for testing. Disabled if `0` payload_to_df_fn : callable, default None A callable that takes the HTTP payload bytes as the first argument and the `lines` parameter is passed in as - the second argument and returns a cudf.DataFrame. If unset cudf.read_json is used. + the second argument and returns a DataFrame. If unset `cudf.read_json` is used in GPU mode and + `pandas.read_json` in CPU mode. **request_kwargs : dict Additional arguments to pass to the `requests.request` function. """ @@ -101,7 +103,7 @@ def __init__(self, max_retries: int = 10, lines: bool = False, stop_after: int = 0, - payload_to_df_fn: typing.Callable[[bytes, bool], cudf.DataFrame] = None, + payload_to_df_fn: typing.Callable[[bytes, bool], DataFrameType] = None, **request_kwargs): super().__init__(config) self._url = http_utils.prepare_url(url) @@ -139,9 +141,14 @@ def __init__(self, self._stop_after = stop_after self._lines = lines - self._payload_to_df_fn = payload_to_df_fn self._requst_kwargs = request_kwargs + if payload_to_df_fn is not None: + self._payload_to_df_fn = payload_to_df_fn + else: + reader = get_json_reader(self._config.execution_mode) + self._payload_to_df_fn = lambda payload, lines: reader(payload, lines=lines) + @property def name(self) -> str: """Unique name of the stage""" @@ -154,16 +161,13 @@ def supports_cpp_node(self) -> bool: def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(MessageMeta) - def _parse_response(self, response: requests.Response) -> typing.Union[cudf.DataFrame, None]: + def _parse_response(self, response: requests.Response) -> typing.Union[DataFrameType, None]: """ Returns a DataFrame parsed from the response payload. If the response payload is empty, then `None` is returned. 
""" payload = response.content - if self._payload_to_df_fn is not None: - return self._payload_to_df_fn(payload, self._lines) - - return cudf.read_json(payload, lines=self._lines, engine='cudf') + return self._payload_to_df_fn(payload, self._lines) def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterator[MessageMeta]: # Running counter of the number of messages emitted by this source diff --git a/python/morpheus/morpheus/stages/input/http_server_source_stage.py b/python/morpheus/morpheus/stages/input/http_server_source_stage.py index 8bf22084cf..c286c2dcd2 100644 --- a/python/morpheus/morpheus/stages/input/http_server_source_stage.py +++ b/python/morpheus/morpheus/stages/input/http_server_source_stage.py @@ -23,11 +23,11 @@ import mrc -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -35,6 +35,7 @@ from morpheus.utils.http_utils import HttpParseResponse from morpheus.utils.http_utils import MimeTypes from morpheus.utils.producer_consumer_queue import Closed +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -43,7 +44,7 @@ @register_stage("from-http") -class HttpServerSourceStage(PreallocatorMixin, SingleOutputSource): +class HttpServerSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Source stage that starts an HTTP server and listens for incoming requests on a specified endpoint. @@ -81,7 +82,7 @@ class HttpServerSourceStage(PreallocatorMixin, SingleOutputSource): Stops ingesting after emitting `stop_after` records (rows in the dataframe). Useful for testing. 
Disabled if `0` payload_to_df_fn : callable, default None A callable that takes the HTTP payload string as the first argument and the `lines` parameter is passed in as - the second argument and returns a cudf.DataFrame. When supplied, the C++ implementation of this stage is + the second argument and returns a DataFrame. When supplied, the C++ implementation of this stage is disabled, and the Python impl is used. """ @@ -104,7 +105,7 @@ def __init__(self, request_timeout_secs: int = 30, lines: bool = False, stop_after: int = 0, - payload_to_df_fn: typing.Callable[[str, bool], cudf.DataFrame] = None): + payload_to_df_fn: typing.Callable[[str, bool], DataFrameType] = None): super().__init__(config) self._bind_address = bind_address self._port = port @@ -123,9 +124,11 @@ def __init__(self, self._request_timeout_secs = request_timeout_secs self._lines = lines self._stop_after = stop_after - self._payload_to_df_fn = payload_to_df_fn self._http_server = None + # Leave this as None so we can check if it's set later + self._payload_to_df_fn = payload_to_df_fn + # These are only used when C++ mode is disabled self._queue = None self._queue_size = 0 @@ -163,12 +166,7 @@ def stop(self): def _parse_payload(self, payload: str) -> HttpParseResponse: try: - if self._payload_to_df_fn is not None: - df = self._payload_to_df_fn(payload, self._lines) - else: - # engine='cudf' is needed when lines=False to avoid using pandas - df = cudf.read_json(StringIO(initial_value=payload), lines=self._lines, engine='cudf') - + df = self._payload_to_df_fn(payload, self._lines) except Exception as e: err_msg = "Error occurred converting HTTP payload to Dataframe" logger.error("%s: %s", err_msg, e) @@ -250,7 +248,8 @@ def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterator[Me # shutdown since we already returned an OK response to the client. 
df = None try: - df = self._queue.get(block=False) + # Intentionally not using self._queue_timeout here since that value is rather high + df = self._queue.get(block=False, timeout=0.1) self._queue_size -= 1 except queue.Empty: if (not self._http_server.is_running() or self.is_stop_requested() @@ -270,6 +269,10 @@ def _generate_frames(self, subscription: mrc.Subscription) -> typing.Iterator[Me if self._stop_after > 0 and self._records_emitted >= self._stop_after: self._processing = False + def _set_default_payload_to_df_fn(self): + reader = get_json_reader(self._config.execution_mode) + self._payload_to_df_fn = lambda payload, lines: reader(StringIO(initial_value=payload), lines=lines) + def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: if self._build_cpp_node() and self._payload_to_df_fn is None: import morpheus._lib.stages as _stages @@ -289,6 +292,9 @@ def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: lines=self._lines, stop_after=self._stop_after) else: + if self._payload_to_df_fn is None: + self._set_default_payload_to_df_fn() + node = builder.make_source(self.unique_name, self._generate_frames) return node diff --git a/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py b/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py index c9630549d6..4139ab41ed 100644 --- a/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py +++ b/python/morpheus/morpheus/stages/input/in_memory_data_generation_stage.py @@ -18,6 +18,7 @@ import mrc from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -27,7 +28,7 @@ | typing.Callable[[], typing.Iterable[typing.Any]]) -class InMemoryDataGenStage(SingleOutputSource): +class InMemoryDataGenStage(GpuAndCpuMixin, SingleOutputSource): """ Source stage that generates data 
in-memory using a provided iterable or generator function. @@ -54,7 +55,7 @@ def compute_schema(self, schema: StageSchema): # Set the output schema based on the OutputDataType schema.output_schema.set_type(self._output_data_type) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: diff --git a/python/morpheus/morpheus/stages/input/in_memory_source_stage.py b/python/morpheus/morpheus/stages/input/in_memory_source_stage.py index c977845eaa..726a1e40b1 100644 --- a/python/morpheus/morpheus/stages/input/in_memory_source_stage.py +++ b/python/morpheus/morpheus/stages/input/in_memory_source_stage.py @@ -16,13 +16,12 @@ import mrc -import cudf - from morpheus.config import Config from morpheus.messages import MessageMeta from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.stage_schema import StageSchema from morpheus.stages.input.in_memory_data_generation_stage import InMemoryDataGenStage +from morpheus.utils.type_aliases import DataFrameType class InMemorySourceStage(PreallocatorMixin, InMemoryDataGenStage): @@ -33,13 +32,13 @@ class InMemorySourceStage(PreallocatorMixin, InMemoryDataGenStage): ---------- c : `morpheus.config.Config` Pipeline configuration instance. - dataframes : typing.List[cudf.DataFrame] + dataframes : list[DataFrameType] List of dataframes to emit wrapped in `MessageMeta` instances in order. repeat : int, default = 1, min = 1 Repeats the input dataset multiple times. Useful to extend small datasets for debugging. 
""" - def __init__(self, c: Config, dataframes: typing.List[cudf.DataFrame], repeat: int = 1): + def __init__(self, c: Config, dataframes: list[DataFrameType], repeat: int = 1): # Prepare a generator function based on the provided dataframes and repeat count self._dataframes = dataframes self._repeat_count = repeat diff --git a/python/morpheus/morpheus/stages/input/kafka_source_stage.py b/python/morpheus/morpheus/stages/input/kafka_source_stage.py index 275418b72b..8770bd91ea 100644 --- a/python/morpheus/morpheus/stages/input/kafka_source_stage.py +++ b/python/morpheus/morpheus/stages/input/kafka_source_stage.py @@ -22,17 +22,17 @@ import mrc import pandas as pd -import cudf - -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.config import auto_determine_bootstrap +from morpheus.io.utils import get_json_reader from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -45,7 +45,7 @@ class AutoOffsetReset(Enum): @register_stage("from-kafka", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class KafkaSourceStage(PreallocatorMixin, SingleOutputSource): +class KafkaSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): """ Load messages from a Kafka cluster. 
@@ -128,6 +128,9 @@ def __init__(self, self._poll_interval = pd.Timedelta(poll_interval).total_seconds() self._started = False + # Defined later if in CPU mode + self._json_reader: typing.Callable[..., DataFrameType] = None + self._records_emitted = 0 self._num_messages = 0 @@ -155,7 +158,7 @@ def _process_batch(self, consumer, batch): df = None try: buffer.seek(0) - df = cudf.io.read_json(buffer, engine='cudf', lines=True, orient='records') + df = self._json_reader(buffer, lines=True, orient='records') except Exception as e: logger.error("Error parsing payload into a dataframe : %s", e) finally: @@ -226,6 +229,7 @@ def _source_generator(self, subscription: mrc.Subscription): def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: if (self._build_cpp_node()): + import morpheus._lib.stages as _stages source = _stages.KafkaSourceStage(builder, self.unique_name, self._max_batch_size, @@ -241,6 +245,7 @@ def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: # multiple threads source.launch_options.pe_count = self._max_concurrent else: + self._json_reader = get_json_reader(self._config.execution_mode) source = builder.make_source(self.unique_name, self._source_generator) return source diff --git a/python/morpheus/morpheus/stages/input/rss_source_stage.py b/python/morpheus/morpheus/stages/input/rss_source_stage.py index c9d9d01ac3..a5dc473189 100644 --- a/python/morpheus/morpheus/stages/input/rss_source_stage.py +++ b/python/morpheus/morpheus/stages/input/rss_source_stage.py @@ -20,6 +20,7 @@ from morpheus.config import Config from morpheus.controllers.rss_controller import RSSController from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema @@ -28,7 +29,7 @@ @register_stage("from-rss") -class
RSSSourceStage(PreallocatorMixin, SingleOutputSource): +class RSSSourceStage(GpuAndCpuMixin, PreallocatorMixin, SingleOutputSource): """ Load RSS feed items into a DataFrame. @@ -82,13 +83,14 @@ def __init__(self, strip_markup=strip_markup, stop_after=stop_after, interval_secs=interval_secs, - should_stop_fn=self.is_stop_requested) + should_stop_fn=self.is_stop_requested, + df_type=self.df_type_str) @property def name(self) -> str: return "from-rss" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False def compute_schema(self, schema: StageSchema): diff --git a/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py b/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py index 86ae3dc6ce..ab7ec49f40 100644 --- a/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py +++ b/python/morpheus/morpheus/stages/output/compare_dataframe_stage.py @@ -21,14 +21,13 @@ import pandas as pd -import cudf - from morpheus.config import Config from morpheus.io.deserializers import read_file_to_df from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage from morpheus.utils import compare_df as compare_df_module from morpheus.utils import concat_df from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type class CompareDataFrameStage(InMemorySinkStage): @@ -74,8 +73,6 @@ def __init__(self, if isinstance(compare_df, str): compare_df = read_file_to_df(compare_df, df_type='pandas') - elif isinstance(compare_df, cudf.DataFrame): - compare_df = compare_df.to_pandas() elif isinstance(compare_df, list): tmp_dfs = [] for item in compare_df: @@ -83,6 +80,8 @@ def __init__(self, tmp_dfs.append(tmp_df) compare_df = pd.concat(tmp_dfs) compare_df.reset_index(inplace=True, drop=True) + elif is_cudf_type(compare_df): + compare_df = compare_df.to_pandas() self._compare_df = compare_df diff --git a/python/morpheus/morpheus/stages/output/http_client_sink_stage.py 
b/python/morpheus/morpheus/stages/output/http_client_sink_stage.py index a9cb872b4c..083a97b9ce 100644 --- a/python/morpheus/morpheus/stages/output/http_client_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/http_client_sink_stage.py @@ -25,6 +25,7 @@ from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils import http_utils @@ -36,7 +37,7 @@ @register_stage("to-http", ignore_args=["query_params", "headers", "df_to_request_kwargs_fn", "**request_kwargs"]) -class HttpClientSinkStage(PassThruTypeMixin, SinglePortStage): +class HttpClientSinkStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write all messages to an HTTP endpoint. diff --git a/python/morpheus/morpheus/stages/output/http_server_sink_stage.py b/python/morpheus/morpheus/stages/output/http_server_sink_stage.py index dcf59d3864..448285f018 100644 --- a/python/morpheus/morpheus/stages/output/http_server_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/http_server_sink_stage.py @@ -22,15 +22,13 @@ from io import StringIO import mrc -import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.http_utils import HTTPMethod @@ -42,7 +40,7 @@ @register_stage("to-http-server", ignore_args=["df_serializer_fn"]) -class HttpServerSinkStage(PassThruTypeMixin, SinglePortStage): +class HttpServerSinkStage(GpuAndCpuMixin, 
PassThruTypeMixin, SinglePortStage): """ Sink stage that starts an HTTP server and listens for incoming requests on a specified endpoint. @@ -116,6 +114,8 @@ def __init__(self, self._df_serializer_fn = df_serializer_fn or self._default_df_serializer + self._df_pkg = self.get_df_pkg() + # FiberQueue doesn't have a way to check the size, nor does it have a way to check if it's empty without # attempting to perform a read. We'll keep track of the size ourselves. self._queue = queue.Queue(maxsize=max_queue_size or config.edge_buffer_size) @@ -201,10 +201,10 @@ def _request_handler(self, _: str) -> HttpParseResponse: body=err_msg) if (len(data_frames) > 0): - df = data_frames[0] if len(data_frames) > 1: - cat_fn = pd.concat if isinstance(df, pd.DataFrame) else cudf.concat - df = cat_fn(data_frames) + df = self._df_pkg.concat(data_frames) + else: + df = data_frames[0] return HttpParseResponse(status_code=HTTPStatus.OK.value, content_type=self._content_type, diff --git a/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py b/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py index ea2998ea3c..f81a61c169 100644 --- a/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py +++ b/python/morpheus/morpheus/stages/output/in_memory_sink_stage.py @@ -18,11 +18,12 @@ import mrc.core.operators as ops from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class InMemorySinkStage(PassThruTypeMixin, SinglePortStage): +class InMemorySinkStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Collects incoming messages into a list that can be accessed after the pipeline is complete. Useful for testing. 
diff --git a/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py b/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py index 6b98ffeb92..53d028d987 100644 --- a/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_databricks_deltalake_stage.py @@ -19,8 +19,6 @@ import pandas as pd from mrc.core import operators as ops -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import MessageMeta @@ -97,8 +95,9 @@ def write_to_deltalake(meta: MessageMeta): convert cudf to spark dataframe """ df = meta.copy_dataframe() - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() + schema = self._extract_schema_from_pandas_dataframe(df) spark_df = self.spark.createDataFrame(df, schema=schema) spark_df.write \ diff --git a/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py b/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py index eede6926e8..f26948cf6a 100644 --- a/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_elasticsearch_stage.py @@ -18,10 +18,9 @@ import mrc import mrc.core.operators as ops +import pandas as pd import yaml -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.elasticsearch_controller import ElasticsearchController @@ -110,7 +109,7 @@ def on_data(meta: MessageMeta) -> MessageMeta: self._controller.refresh_client() df = meta.copy_dataframe() - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() logger.debug("Converted cudf of size: %s to pandas dataframe.", len(df)) diff --git a/python/morpheus/morpheus/stages/output/write_to_file_stage.py 
b/python/morpheus/morpheus/stages/output/write_to_file_stage.py index 46b7e5cec6..9f3298bc61 100644 --- a/python/morpheus/morpheus/stages/output/write_to_file_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_file_stage.py @@ -18,18 +18,18 @@ import mrc import mrc.core.operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common import FileTypes from morpheus.config import Config from morpheus.controllers.write_to_file_controller import WriteToFileController from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @register_stage("to-file", rename_options={"include_index_col": "--include-index-col"}) -class WriteToFileStage(PassThruTypeMixin, SinglePortStage): +class WriteToFileStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write all messages to a file. 
@@ -92,6 +92,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: # Sink to file if (self._build_cpp_node()): + import morpheus._lib.stages as _stages to_file_node = _stages.WriteToFileStage(builder, self.unique_name, self._controller.output_file, diff --git a/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py b/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py index 3546a14563..ad7954f977 100644 --- a/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py +++ b/python/morpheus/morpheus/stages/output/write_to_kafka_stage.py @@ -24,6 +24,7 @@ from morpheus.config import Config from morpheus.io import serializers from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -31,7 +32,7 @@ @register_stage("to-kafka") -class WriteToKafkaStage(PassThruTypeMixin, SinglePortStage): +class WriteToKafkaStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Write all messages to a Kafka cluster. 
diff --git a/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py b/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py index 5937a2077b..e4ab126cdd 100644 --- a/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/add_classifications_stage.py @@ -63,7 +63,7 @@ def __init__(self, def name(self) -> str: return "add-class" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: # Enable support by default return True diff --git a/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py b/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py index 75c796ee3a..bb55d1f3b9 100644 --- a/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py +++ b/python/morpheus/morpheus/stages/postprocess/add_scores_stage_base.py @@ -23,13 +23,14 @@ from morpheus.common import TypeId from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage logger = logging.getLogger(__name__) -class AddScoresStageBase(PassThruTypeMixin, SinglePortStage): +class AddScoresStageBase(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Base class for the `AddScoresStage` and `AddClassificationStage` diff --git a/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py b/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py index 925d0deb73..45fc41ef56 100644 --- a/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/filter_detections_stage.py @@ -18,12 +18,12 @@ import mrc from mrc.core import operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.common 
import FilterSource from morpheus.config import Config from morpheus.controllers.filter_detections_controller import FilterDetectionsController from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @@ -31,7 +31,7 @@ @register_stage("filter") -class FilterDetectionsStage(SinglePortStage): +class FilterDetectionsStage(GpuAndCpuMixin, SinglePortStage): """ Filter message by a classification threshold. @@ -113,6 +113,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: if self._build_cpp_node(): + import morpheus._lib.stages as _stages node = _stages.FilterDetectionsStage(builder, self.unique_name, self._controller.threshold, diff --git a/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py b/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py index 7e62870138..25f02d4ea0 100644 --- a/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/generate_viz_frames_stage.py @@ -27,22 +27,22 @@ import websockets.legacy.server from websockets.server import serve -import cudf - from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.producer_consumer_queue import AsyncIOProducerConsumerQueue from morpheus.utils.producer_consumer_queue import Closed +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @register_stage("gen-viz", 
modes=[PipelineModes.NLP], command_args={"deprecated": True}) -class GenerateVizFramesStage(PassThruTypeMixin, SinglePortStage): +class GenerateVizFramesStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Write out visualization DataFrames. @@ -81,6 +81,8 @@ def __init__(self, self._server_task: asyncio.Task = None self._server_close_event: asyncio.Event = None + self._df_class: type[DataFrameType] = self.get_df_class() + @property def name(self) -> str: return "gen_viz" @@ -142,7 +144,7 @@ def indent_data(y: str): except Exception: return y - if isinstance(df, cudf.DataFrame): + if not isinstance(df, pd.DataFrame): df = df.to_pandas() df["data"] = df["data"].apply(indent_data) @@ -278,7 +280,7 @@ def write_batch(msg: ControlMessage): columns = ["timestamp", "src_ip", "dest_ip", "secret_keys", "data"] df = msg.payload().get_data(columns) - out_df = cudf.DataFrame() + out_df = self._df_class() out_df["dt"] = (df["timestamp"] - time0).astype(np.int32) out_df["src"] = df["src_ip"].str.ip_to_int().astype(np.uint32) diff --git a/python/morpheus/morpheus/stages/postprocess/serialize_stage.py b/python/morpheus/morpheus/stages/postprocess/serialize_stage.py index ba61d9274b..47afb85082 100644 --- a/python/morpheus/morpheus/stages/postprocess/serialize_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/serialize_stage.py @@ -19,12 +19,12 @@ import mrc from mrc.core import operators as ops -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.controllers.serialize_controller import SerializeController from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema @@ -32,7 +32,7 @@ @register_stage("serialize") -class SerializeStage(SinglePortStage): +class 
SerializeStage(GpuAndCpuMixin, SinglePortStage): """ Includes & excludes columns from messages. @@ -91,6 +91,7 @@ def supports_cpp_node(self): def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: if (self._build_cpp_node()): + import morpheus._lib.stages as _stages node = _stages.SerializeStage(builder, self.unique_name, self._controller.include_columns or [], diff --git a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py index 493c57a6df..5d7e5d5a67 100644 --- a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py @@ -350,7 +350,9 @@ def _calc_timeseries(self, x: ControlMessage, is_complete: bool): # Save this message in the pending queue self._pending_messages.append(x) - new_timedata = x.payload().get_data([self._timestamp_col]).to_pandas() + new_timedata = x.payload().get_data([self._timestamp_col]) + if not isinstance(new_timedata, pd.DataFrame): + new_timedata = new_timedata.to_pandas() # Save this message event times in the event list. Ensure the values are always sorted self._timeseries_data = pd.concat([self._timeseries_data, new_timedata]).sort_index() diff --git a/python/morpheus/morpheus/stages/postprocess/validation_stage.py b/python/morpheus/morpheus/stages/postprocess/validation_stage.py index 99da57b36d..e39c814136 100644 --- a/python/morpheus/morpheus/stages/postprocess/validation_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/validation_stage.py @@ -30,7 +30,7 @@ @register_stage("validate") -class ValidationStage(CompareDataFrameStage): +class ValidationStage(CompareDataFrameStage): # pylint: disable=too-many-ancestors """ Validate pipeline output for testing. 
diff --git a/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py b/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py index 8c0eaf17fe..7605be4f79 100644 --- a/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/deserialize_stage.py @@ -25,6 +25,7 @@ from morpheus.messages import MessageMeta from morpheus.modules.preprocess.deserialize import DeserializeLoaderFactory from morpheus.pipeline.control_message_stage import ControlMessageStage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.stage_schema import StageSchema logger = logging.getLogger(__name__) @@ -33,7 +34,7 @@ @register_stage("deserialize", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER], ignore_args=["task_type", "task_payload"]) -class DeserializeStage(ControlMessageStage): +class DeserializeStage(GpuAndCpuMixin, ControlMessageStage): """ Messages are logically partitioned based on the pipeline config's `pipeline_batch_size` parameter. 
@@ -77,15 +78,6 @@ def __init__(self, if ((self._task_type is None) != (self._task_payload is None)): raise ValueError("Both `task_type` and `task_payload` must be specified if either is specified.") - self._module_config = { - "ensure_sliceable_index": self._ensure_sliceable_index, - "task_type": self._task_type, - "task_payload": self._task_payload, - "batch_size": self._batch_size, - "max_concurrency": self._max_concurrent, - "should_log_timestamp": self._should_log_timestamps - } - @property def name(self) -> str: return "deserialize" @@ -116,8 +108,17 @@ def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> builder.make_edge(input_node, out_node) else: + module_config = { + "ensure_sliceable_index": self._ensure_sliceable_index, + "task_type": self._task_type, + "task_payload": self._task_payload, + "batch_size": self._batch_size, + "max_concurrency": self._max_concurrent, + "should_log_timestamp": self._should_log_timestamps + } + module_loader = DeserializeLoaderFactory.get_instance(module_name=f"deserialize_{self.unique_name}", - module_config=self._module_config) + module_config=module_config) module = module_loader.load(builder=builder) mod_in_node = module.input_port("input") diff --git a/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py b/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py index 697cce089a..7926aeb8d4 100644 --- a/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/drop_null_stage.py @@ -12,21 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import typing - import mrc from mrc.core import operators as ops from morpheus.cli.register_stage import register_stage from morpheus.config import Config -from morpheus.config import PipelineModes from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -@register_stage("dropna", modes=[PipelineModes.FIL, PipelineModes.NLP, PipelineModes.OTHER]) -class DropNullStage(PassThruTypeMixin, SinglePortStage): +@register_stage("dropna") +class DropNullStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Drop null data entries from a DataFrame. @@ -51,27 +49,26 @@ def __init__(self, c: Config, column: str): def name(self) -> str: return "dropna" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Accepted input types for this stage are returned. Returns ------- - typing.Tuple + tuple Accepted input types. 
""" return (MessageMeta, ) - def supports_cpp_node(self): - # Enable support by default + def supports_cpp_node(self) -> bool: return False def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - def on_next(x: MessageMeta): - - y = MessageMeta(x.df[~x.df[self._column].isna()]) + def on_next(msg: MessageMeta): + df = msg.copy_dataframe() + y = MessageMeta(df[~df[self._column].isna()]) return y diff --git a/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py b/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py index d69504dd27..e31f151068 100644 --- a/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/group_by_column_stage.py @@ -17,11 +17,12 @@ from morpheus.config import Config from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class GroupByColumnStage(PassThruTypeMixin, SinglePortStage): +class GroupByColumnStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Group the incoming message by a column in the DataFrame. diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py index 0a0d36b97e..d8f5debf28 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_base_stage.py @@ -13,7 +13,6 @@ # limitations under the License. 
import typing -from abc import abstractmethod import mrc from mrc.core import operators as ops @@ -38,7 +37,6 @@ class PreprocessBaseStage(ControlMessageStage): def __init__(self, c: Config): super().__init__(c) - self._preprocess_fn = None self._should_log_timestamps = True def accepted_types(self) -> typing.Tuple: @@ -49,24 +47,27 @@ def accepted_types(self) -> typing.Tuple: return (ControlMessage, ) def compute_schema(self, schema: StageSchema): - self._preprocess_fn = self._get_preprocess_fn() schema.output_schema.set_type(ControlMessage) - @abstractmethod def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - pass + """ + This method should be implemented by any subclasses with a Python implementation. + """ + raise NotImplementedError("No Python implementation provided by this stage") - @abstractmethod def _get_preprocess_node(self, builder: mrc.Builder) -> mrc.SegmentObject: - pass + """ + This method should be implemented by any subclasses with a C++ implementation. 
+ """ + raise NotImplementedError("No C++ implementation provided by this stage") def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: - assert self._preprocess_fn is not None, "Preprocess function not set" if self._build_cpp_node(): node = self._get_preprocess_node(builder) node.launch_options.pe_count = self._config.num_threads else: - node = builder.make_node(self.unique_name, ops.map(self._preprocess_fn)) + preprocess_fn = self._get_preprocess_fn() + node = builder.make_node(self.unique_name, ops.map(preprocess_fn)) builder.make_edge(input_node, node) diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py index e113958c4c..b3e6895ae0 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_fil_stage.py @@ -13,22 +13,12 @@ # limitations under the License. import logging -import typing -from functools import partial -import cupy as cp import mrc -import numpy as np -import pandas as pd -import cudf - -import morpheus._lib.messages as _messages -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage logger = logging.getLogger(__name__) @@ -59,62 +49,9 @@ def __init__(self, c: Config): def name(self) -> str: return "preprocess-fil" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return True - @staticmethod - def pre_process_batch(msg: ControlMessage, fea_len: int, fea_cols: typing.List[str]) -> ControlMessage: - """ - For FIL category usecases, this function performs pre-processing. 
- - Parameters - ---------- - msg : `morpheus.messages.ControlMessage` - Input rows received from Deserialized stage. - fea_len : int - Number features are being used in the inference. - fea_cols : typing.Tuple[str] - List of columns that are used as features. - - Returns - ------- - `morpheus.messages.ControlMessage` - - """ - try: - df: cudf.DataFrame = msg.payload().get_data(fea_cols) - except KeyError: - logger.exception("Requested feature columns does not exist in the dataframe.", exc_info=True) - raise - - # Extract just the numbers from each feature col. Not great to operate on x.meta.df here but the operations will - # only happen once. - for col in fea_cols: - if (df[col].dtype == np.dtype(str) or df[col].dtype == np.dtype(object)): - # If the column is a string, parse the number - df[col] = df[col].str.extract(r"(\d+)", expand=False).astype("float32") - elif (df[col].dtype != np.float32): - # Convert to float32 - df[col] = df[col].astype("float32") - - if (isinstance(df, pd.DataFrame)): - df = cudf.from_pandas(df) - - # Convert the dataframe to cupy the same way cuml does - data = cp.asarray(df.to_cupy()) - - count = data.shape[0] - - seg_ids = cp.zeros((count, 3), dtype=cp.uint32) - seg_ids[:, 0] = cp.arange(0, count, dtype=cp.uint32) - seg_ids[:, 2] = fea_len - 1 - - # We need the C++ impl of TensorMemory until #1646 is resolved - msg.tensors(_messages.TensorMemory(count=count, tensors={"input__0": data, "seq_ids": seg_ids})) - return msg - - def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - return partial(PreprocessFILStage.pre_process_batch, fea_len=self._fea_length, fea_cols=self.features) - def _get_preprocess_node(self, builder: mrc.Builder): + import morpheus._lib.stages as _stages return _stages.PreprocessFILStage(builder, self.unique_name, self.features) diff --git a/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py b/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py index 
1f92d97b8f..3a85af54cb 100644 --- a/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py +++ b/python/morpheus/morpheus/stages/preprocess/preprocess_nlp_stage.py @@ -12,62 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import base64 -import json import logging -import typing -from functools import partial -import cupy as cp import mrc -import numpy as np -import cudf - -import morpheus._lib.messages as _messages -import morpheus._lib.stages as _stages from morpheus.cli.register_stage import register_stage from morpheus.cli.utils import MorpheusRelativePath from morpheus.cli.utils import get_package_relative_file from morpheus.config import Config from morpheus.config import PipelineModes -from morpheus.messages import ControlMessage from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage -from morpheus.utils.cudf_subword_helper import tokenize_text_series logger = logging.getLogger(__name__) -def cupyarray_to_base64(cupy_array): - array_bytes = cupy_array.get().tobytes() - array_shape = cupy_array.shape - array_dtype = str(cupy_array.dtype) - - # Create a dictionary to store bytes, shape, and dtype - encoded_dict = {'bytes': base64.b64encode(array_bytes).decode("utf-8"), 'shape': array_shape, 'dtype': array_dtype} - - # Convert dictionary to JSON string for storage - return json.dumps(encoded_dict) - - -def base64_to_cupyarray(base64_str): - # Convert JSON string back to dictionary - encoded_dict = json.loads(base64_str) - - # Extract bytes, shape, and dtype - array_bytes = base64.b64decode(encoded_dict['bytes']) - array_shape = tuple(encoded_dict['shape']) - array_dtype = encoded_dict['dtype'] - - # Convert bytes back to a NumPy array and reshape - np_array = np.frombuffer(array_bytes, dtype=array_dtype).reshape(array_shape) - - # Convert NumPy array to CuPy array - cp_array = cp.array(np_array) - - return cp_array - - @register_stage( "preprocess", 
modes=[PipelineModes.NLP], @@ -133,64 +91,11 @@ def __init__(self, def name(self) -> str: return "preprocess-nlp" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return True - @staticmethod - def pre_process_batch(message: ControlMessage, - vocab_hash_file: str, - do_lower_case: bool, - seq_len: int, - stride: int, - truncation: bool, - add_special_tokens: bool, - column: str) -> ControlMessage: - """ - For NLP category use cases, this function performs pre-processing. - - [parameters are the same as the original function] - - Returns - ------- - `morpheus.messages.ControlMessage` - - """ - with message.payload().mutable_dataframe() as mdf: - text_series = cudf.Series(mdf[column]) - - tokenized = tokenize_text_series(vocab_hash_file=vocab_hash_file, - do_lower_case=do_lower_case, - text_ser=text_series, - seq_len=seq_len, - stride=stride, - truncation=truncation, - add_special_tokens=add_special_tokens) - - del text_series - - # We need the C++ impl of TensorMemory until #1646 is resolved - message.tensors( - _messages.TensorMemory(count=tokenized.input_ids.shape[0], - tensors={ - "input_ids": tokenized.input_ids, - "input_mask": tokenized.input_mask, - "seq_ids": tokenized.segment_ids - })) - - message.set_metadata("inference_memory_params", {"inference_type": "nlp"}) - return message - - def _get_preprocess_fn(self) -> typing.Callable[[ControlMessage], ControlMessage]: - return partial(PreprocessNLPStage.pre_process_batch, - vocab_hash_file=self._vocab_hash_file, - do_lower_case=self._do_lower_case, - stride=self._stride, - seq_len=self._seq_length, - truncation=self._truncation, - add_special_tokens=self._add_special_tokens, - column=self._column) - def _get_preprocess_node(self, builder: mrc.Builder): + import morpheus._lib.stages as _stages return _stages.PreprocessNLPStage(builder, self.unique_name, self._vocab_hash_file, diff --git a/python/morpheus/morpheus/utils/column_info.py b/python/morpheus/morpheus/utils/column_info.py index 
75119320e4..6fe8a1cb8f 100644 --- a/python/morpheus/morpheus/utils/column_info.py +++ b/python/morpheus/morpheus/utils/column_info.py @@ -22,7 +22,8 @@ import pandas as pd -import cudf +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type logger = logging.getLogger(f"morpheus.{__name__}") @@ -30,7 +31,7 @@ # Note(Devin): Proxying this for backwards compatibility. Had to move the primary definition to avoid circular imports. -def process_dataframe(df_in: typing.Union[pd.DataFrame, cudf.DataFrame], input_schema) -> pd.DataFrame: +def process_dataframe(df_in: DataFrameType, input_schema) -> pd.DataFrame: """ Processes a dataframe according to the given schema. @@ -83,7 +84,7 @@ def create_increment_col(df: pd.DataFrame, """ # Ensure we are pandas for this - if (isinstance(df, cudf.DataFrame)): + if (not isinstance(df, pd.DataFrame)): df = df.to_pandas() time_col = df[timestamp_column].fillna(pd.to_datetime(DEFAULT_DATE)) @@ -595,16 +596,16 @@ class PreparedDFInfo: Attributes ---------- - df : typing.Union[pd.DataFrame, cudf.DataFrame] + df : DataFrameType The prepared DataFrame. - columns_to_preserve : typing.List[str] + columns_to_preserve : list[str] A list of column names that are to be preserved. """ - df: typing.Union[pd.DataFrame, cudf.DataFrame] - columns_to_preserve: typing.List[str] + df: DataFrameType + columns_to_preserve: list[str] -def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], +def _json_flatten(df_input: DataFrameType, input_columns: dict[str, str], json_cols: list[str], preserve_re: re.Pattern = None): @@ -614,7 +615,7 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], Parameters ---------- - df_input : typing.Union[pd.DataFrame, cudf.DataFrame] + df_input : DataFrameType DataFrame to process. input_columns : dict[str, str] The final input columns that are needed for processing. 
All other columns will be removed @@ -625,7 +626,7 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], Returns ------- - typing.Union[pd.DataFrame, cudf.DataFrame] + DataFrameType The processed DataFrame. """ @@ -640,10 +641,9 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], # Check if we even have any JSON columns to flatten if (not df_input.columns.intersection(json_cols).empty): - convert_to_cudf = False + is_cudf = is_cudf_type(df_input) - if (isinstance(df_input, cudf.DataFrame)): - convert_to_cudf = True + if (is_cudf): df_input = df_input.to_pandas() json_normalized = [] @@ -672,7 +672,8 @@ def _json_flatten(df_input: typing.Union[pd.DataFrame, cudf.DataFrame], # Combine the original DataFrame with the normalized JSON columns df_input = pd.concat([df_input[columns_to_keep]] + json_normalized, axis=1) - if (convert_to_cudf): + if (is_cudf): + import cudf df_input = cudf.from_pandas(df_input).reset_index(drop=True) # Remove all columns that are not in the input columns list. Ensure the correct types diff --git a/python/morpheus/morpheus/utils/concat_df.py b/python/morpheus/morpheus/utils/concat_df.py index f709d46c10..1956f83730 100644 --- a/python/morpheus/morpheus/utils/concat_df.py +++ b/python/morpheus/morpheus/utils/concat_df.py @@ -14,13 +14,12 @@ import pandas as pd -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.utils.type_utils import is_cudf_type -def concat_dataframes(messages: list[ControlMessage] | list[MessageMeta]) -> pd.DataFrame: +def concat_dataframes(messages: list[ControlMessage | MessageMeta]) -> pd.DataFrame: """ Concatinate the DataFrame associated with the collected messages into a single Pandas DataFrame. @@ -43,7 +42,7 @@ def concat_dataframes(messages: list[ControlMessage] | list[MessageMeta]) -> pd. 
else: raise ValueError("Invalid message type") - if isinstance(df, cudf.DataFrame): + if is_cudf_type(df): df = df.to_pandas() all_meta.append(df) diff --git a/python/morpheus/morpheus/utils/module_utils.py b/python/morpheus/morpheus/utils/module_utils.py index f1aca63334..a250f1a650 100644 --- a/python/morpheus/morpheus/utils/module_utils.py +++ b/python/morpheus/morpheus/utils/module_utils.py @@ -21,12 +21,10 @@ from typing import Type import mrc -import pandas as pd from pydantic import BaseModel -import cudf - from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj logger = logging.getLogger(__name__) @@ -190,9 +188,9 @@ def merge_dictionaries(primary_dict, secondary_dict): } -def to_period_approximation(data_df: DataFrameType, period: str): +def to_period_approximation(data_df: DataFrameType, period: str) -> DataFrameType: """ - This function converts a cudf dataframe to a period approximation. + This function converts a dataframe to a period approximation. Parameters ---------- @@ -203,7 +201,7 @@ def to_period_approximation(data_df: DataFrameType, period: str): Returns ------- - cudf.DataFrame + DataFrame Period approximation of the input cudf/pandas dataframe. 
""" @@ -216,8 +214,8 @@ def to_period_approximation(data_df: DataFrameType, period: str): strptime_format = period_to_strptime[period] - df_mod = cudf if isinstance(data_df, cudf.DataFrame) else pd - data_df["period"] = df_mod.to_datetime(data_df["ts"].dt.strftime(strptime_format) + '-1', + df_pkg = get_df_pkg_from_obj(data_df) + data_df["period"] = df_pkg.to_datetime(data_df["ts"].dt.strftime(strptime_format) + '-1', format=f"{strptime_format}-%w") return data_df diff --git a/python/morpheus/morpheus/utils/schema_transforms.py b/python/morpheus/morpheus/utils/schema_transforms.py index 1cf8b65183..162a2064db 100644 --- a/python/morpheus/morpheus/utils/schema_transforms.py +++ b/python/morpheus/morpheus/utils/schema_transforms.py @@ -17,9 +17,12 @@ import pandas as pd -import cudf - from morpheus.utils.column_info import DataFrameInputSchema +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type + +if typing.TYPE_CHECKING: + import cudf logger = logging.getLogger(__name__) @@ -34,16 +37,16 @@ def process_dataframe( @typing.overload def process_dataframe( - df_in: cudf.DataFrame, + df_in: "cudf.DataFrame", input_schema: DataFrameInputSchema, -) -> cudf.DataFrame: +) -> "cudf.DataFrame": ... def process_dataframe( - df_in: typing.Union[pd.DataFrame, cudf.DataFrame], + df_in: DataFrameType, input_schema: DataFrameInputSchema, -) -> typing.Union[pd.DataFrame, cudf.DataFrame]: +) -> DataFrameType: """ Applies column transformations to the input dataframe as defined by the `input_schema`. 
@@ -72,10 +75,9 @@ def process_dataframe( output_df = pd.DataFrame() - convert_to_cudf = False - if (isinstance(df_in, cudf.DataFrame)): + is_cudf = is_cudf_type(df_in) + if (is_cudf): df_in = df_in.to_pandas() - convert_to_cudf = True # Iterate over the column info for ci in input_schema.column_info: @@ -94,7 +96,8 @@ def process_dataframe( output_df[match_columns] = df_in[match_columns] - if (convert_to_cudf): + if (is_cudf): + import cudf return cudf.from_pandas(output_df) return output_df diff --git a/python/morpheus/morpheus/utils/seed.py b/python/morpheus/morpheus/utils/seed.py index b016731fa6..d64cd6a6a4 100644 --- a/python/morpheus/morpheus/utils/seed.py +++ b/python/morpheus/morpheus/utils/seed.py @@ -20,17 +20,26 @@ import torch -def manual_seed(seed: int): +def manual_seed(seed: int, cpu_only: bool = False): """ - Manually see the random number generators for the stdlib, PyTorch, NumPy and CuPy + Manually seed the random number generators for the Python standard lib, PyTorch, NumPy and CuPy + + Parameters + ---------- + seed : int + The seed value to use + cpu_only : bool, default = False + When set to True, CuPy and CUDA specific PyTorch settings are not set.
""" random.seed(seed) np.random.seed(seed) - cp.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) # the "all" refers to all GPUs - torch.backends.cudnn.benchmark = False - torch.backends.cudnn.deterministic = True + if not cpu_only: + cp.random.seed(seed) + + torch.cuda.manual_seed_all(seed) # the "all" refers to all GPUs + + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True diff --git a/python/morpheus/morpheus/utils/type_aliases.py b/python/morpheus/morpheus/utils/type_aliases.py index a3e7ddeed0..0028c076fe 100644 --- a/python/morpheus/morpheus/utils/type_aliases.py +++ b/python/morpheus/morpheus/utils/type_aliases.py @@ -15,9 +15,26 @@ import typing -import pandas as pd +if typing.TYPE_CHECKING: + import cupy + import numpy + import pandas -import cudf + import cudf -DataFrameType = typing.Union[pd.DataFrame, cudf.DataFrame] -SeriesType = typing.Union[pd.Series, cudf.Series] +DataFrameModule = typing.Literal["cudf", "pandas"] +"""Valid DataFrame modules.""" + +DataFrameType = typing.Union["pandas.DataFrame", "cudf.DataFrame"] +"""Alias for pandas and cuDF DataFrame types.""" + +SeriesType = typing.Union["pandas.Series", "cudf.Series"] +"""Alias for pandas and cuDF Series types.""" + +NDArrayType = typing.Union["numpy.ndarray", "cupy.ndarray"] +"""Alias for NumPy and CuPy ndarray types.""" + +# Intentionally using `typing.Dict` instead of `dict` to avoid a Sphinx build error. 
+# https://github.com/nv-morpheus/Morpheus/issues/1956 +TensorMapType = typing.Dict[str, NDArrayType] +"""Alias for a dictionary of tensor names to tensors represented as either a NumPy or CuPy ndarray.""" diff --git a/python/morpheus/morpheus/utils/type_utils.py b/python/morpheus/morpheus/utils/type_utils.py index a3aefdde8d..95c870e30f 100644 --- a/python/morpheus/morpheus/utils/type_utils.py +++ b/python/morpheus/morpheus/utils/type_utils.py @@ -11,12 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Utility functions for working with types.""" import inspect import types import typing from collections import defaultdict +import numpy as np +import pandas as pd + +from morpheus.config import CppConfig +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_aliases import DataFrameType + # pylint: disable=invalid-name T_co = typing.TypeVar("T_co", covariant=True) @@ -162,3 +171,252 @@ def get_full_qualname(klass: type) -> str: if module == '__builtin__': return klass.__qualname__ return module + '.' + klass.__qualname__ + + +def df_type_str_to_exec_mode(df_type_str: DataFrameModule) -> ExecutionMode: + """ + Return the appropriate execution mode based on the DataFrame type string. + + Parameters + ---------- + df_type_str : `morpheus.utils.type_aliases.DataFrameModule` + The DataFrame type string. + + Returns + ------- + `morpheus.config.ExecutionMode` + The associated execution mode based on the DataFrame type string. 
+ """ + if df_type_str == "cudf": + return ExecutionMode.GPU + if df_type_str == "pandas": + return ExecutionMode.CPU + + valid_values = ", ".join(typing.get_args(DataFrameModule)) + raise ValueError(f"Invalid DataFrame type string: {df_type_str}, valid values are: {valid_values}") + + +def exec_mode_to_df_type_str(execution_mode: ExecutionMode) -> DataFrameModule: + """ + Return the appropriate DataFrame type string based on the execution mode. + + Parameters + ---------- + execution_mode : `morpheus.config.ExecutionMode` + The execution mode. + + Returns + ------- + `morpheus.utils.type_aliases.DataFrameModule` + The associated DataFrame type string based on the execution mode. + """ + if execution_mode == ExecutionMode.GPU: + return "cudf" + + return "pandas" + + +def cpp_mode_to_exec_mode() -> ExecutionMode: + """ + Return the execution mode based on the configuration of the global `morpheus.config.CppConfig` singleton. + + Returns + ------- + `morpheus.config.ExecutionMode` + The execution mode. + """ + if CppConfig.get_should_use_cpp(): + return ExecutionMode.GPU + return ExecutionMode.CPU + + +def df_type_str_to_pkg(df_type_str: DataFrameModule) -> types.ModuleType: + """ + Import and return the appropriate DataFrame package based on the DataFrame type string. + + Parameters + ---------- + df_type_str : `morpheus.utils.type_aliases.DataFrameModule` + The DataFrame type string. + + Returns + ------- + `types.ModuleType` + The associated DataFrame package based on the DataFrame type string. + """ + if df_type_str == "cudf": + import cudf + return cudf + if df_type_str == "pandas": + return pd + + valid_values = ", ".join(typing.get_args(DataFrameModule)) + raise ValueError(f"Invalid DataFrame type string: {df_type_str}, valid values are: {valid_values}") + + +@typing.overload +def get_df_pkg(selector: DataFrameModule = None) -> types.ModuleType: + ... + + +@typing.overload +def get_df_pkg(selector: ExecutionMode = None) -> types.ModuleType: + ... 
+ + +def get_df_pkg(selector: ExecutionMode | DataFrameModule = None) -> types.ModuleType: + """ + Return the appropriate DataFrame package based on `selector` which can be either an `ExecutionMode` instance, a + DataFrame type string, or `None`. + + When `None` the execution mode is determined by the global `morpheus.config.CppConfig` singleton. + + This method is best used within code which needs to operate in both CPU and GPU modes, where simply importing `cudf` + would cause an import error if the user is not using a GPU. + Example usage:: + + from morpheus.utils.type_utils import get_df_pkg + df_pkg = get_df_pkg() + ser = df_pkg.Series([1,2,3]) + + Parameters + ---------- + selector : `morpheus.utils.type_aliases.DataFrameModule` | `morpheus.config.ExecutionMode` | None, optional + The selector to determine the DataFrame package, by default None. + + Returns + ------- + `types.ModuleType` + The associated DataFrame package based on the selector. + """ + if selector is None: + execution_mode = cpp_mode_to_exec_mode() + elif not isinstance(selector, ExecutionMode): + execution_mode = df_type_str_to_exec_mode(selector) + else: + execution_mode = selector + + if execution_mode == ExecutionMode.GPU: + import cudf + return cudf + + return pd + + +@typing.overload +def get_df_class(selector: DataFrameModule = None) -> type[DataFrameType]: + ... + + +@typing.overload +def get_df_class(selector: ExecutionMode = None) -> type[DataFrameType]: + ... + + +def get_df_class(selector: ExecutionMode | DataFrameModule = None) -> type[DataFrameType]: + """ + Return the appropriate DataFrame `selector` which can be either an `ExecutionMode` instance, a + DataFrame type string, or `None`. + + When `None` the execution mode is determined by the global `morpheus.config.CppConfig` singleton. + + This method is best used within code which needs to construct a DataFrame in both CPU and GPU modes. 
+ Example usage:: + + from morpheus.utils.type_utils import get_df_class + df_class = get_df_class() + df = df_class({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + + Parameters + ---------- + selector : `morpheus.utils.type_aliases.DataFrameModule` | `morpheus.config.ExecutionMode` | None, optional + The selector to determine the DataFrame class, by default None. + + Returns + ------- + `type[DataFrameType]` + """ + df_pkg = get_df_pkg(selector) + return df_pkg.DataFrame + + +def is_cudf_type(obj: typing.Any) -> bool: + """ + Check if a given object (DataFrame, Series, RangeIndex etc...) is a cuDF type. + + Parameters + ---------- + obj : `typing.Any` + The object to check. + + Returns + ------- + `bool` + `True` if the object is a cuDF type, `False` otherwise. + """ + return "cudf" in str(type(obj)) + + +def get_df_pkg_from_obj(obj: typing.Any) -> types.ModuleType: + """ + Return the appropriate DataFrame package based on a given object (DataFrame, Series, RangeIndex etc...). + + Parameters + ---------- + obj : `typing.Any` + The object to check. + + Returns + ------- + `types.ModuleType` + The associated DataFrame package based on the object. + """ + if is_cudf_type(obj): + import cudf + return cudf + + return pd + + +def is_dataframe(obj: typing.Any) -> bool: + """ + Check if a given object is a pandas or cudf DataFrame. + + Parameters + ---------- + obj : `typing.Any` + The object to check. + + Returns + ------- + `bool` + `True` if the object is a DataFrame, `False` otherwise. + """ + df_pkg = get_df_pkg_from_obj(obj) + return isinstance(obj, df_pkg.DataFrame) + + +def get_array_pkg(execution_mode: ExecutionMode = None) -> types.ModuleType: + """ + Return the appropriate array package (CuPy for GPU, NumPy for CPU) based on the execution mode. + + When `None` the execution mode is determined by the global `morpheus.config.CppConfig` singleton. 
+ + Parameters + ---------- + execution_mode : `morpheus.config.ExecutionMode`, optional + The execution mode, by default `None`. + + Returns + ------- + `types.ModuleType` + The associated array package based on the execution mode. + """ + if execution_mode is None: + execution_mode = cpp_mode_to_exec_mode() + + if execution_mode == ExecutionMode.GPU: + import cupy + return cupy + + return np diff --git a/python/morpheus_dfp/morpheus_dfp/messages/__init__.py b/python/morpheus_dfp/morpheus_dfp/messages/__init__.py deleted file mode 100644 index 66061e580b..0000000000 --- a/python/morpheus_dfp/morpheus_dfp/messages/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py b/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py deleted file mode 100644 index 49b8c98ba9..0000000000 --- a/python/morpheus_dfp/morpheus_dfp/messages/dfp_message_meta.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import dataclasses -import logging - -import pandas as pd - -from morpheus.messages.message_meta import MessageMeta - -logger = logging.getLogger(__name__) - - -@dataclasses.dataclass(init=False) -class DFPMessageMeta(MessageMeta, cpp_class=None): - """ - This class extends MessageMeta to also hold userid corresponding to batched metadata. - - Parameters - ---------- - df : pandas.DataFrame - Input rows in dataframe. - user_id : str - User id. - - """ - user_id: str - - def __init__(self, df: pd.DataFrame, user_id: str) -> None: - super().__init__(df) - self.user_id = user_id diff --git a/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py b/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py index 2a66a04d9d..c710d09f9f 100644 --- a/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py +++ b/python/morpheus_dfp/morpheus_dfp/modules/dfp_inference.py @@ -22,9 +22,9 @@ import cudf from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.model_cache import ModelCache from morpheus_dfp.utils.model_cache import ModelManager from morpheus_dfp.utils.module_ids import DFP_INFERENCE @@ -111,11 +111,11 @@ def process_task(control_message: ControlMessage) -> ControlMessage: output_df = cudf.concat([payload.df, results_df[results_cols]], axis=1) # Create an output message to allow setting meta + meta = 
MessageMeta(output_df) + meta.set_data('model_version', f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") output_message = ControlMessage() - output_message.payload(DFPMessageMeta(output_df, user_id=user_id)) - - output_message.payload().set_data('model_version', - f"{model_cache.reg_model_name}:{model_cache.reg_model_version}") + output_message.payload(meta) + output_message.set_metadata("user_id", user_id) if logger.isEnabledFor(logging.DEBUG): load_model_duration = (post_model_time - start_time) * 1000.0 diff --git a/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py b/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py index 234855d9f1..6bc41d1d09 100644 --- a/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py +++ b/python/morpheus_dfp/morpheus_dfp/modules/dfp_training.py @@ -21,10 +21,10 @@ import cudf from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.models.dfencoder import AutoEncoder from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_utils import register_module -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.module_ids import DFP_TRAINING logger = logging.getLogger(f"morpheus.{__name__}") @@ -97,10 +97,9 @@ def on_data(control_message: ControlMessage) -> list[ControlMessage]: model.fit(train_df, epochs=epochs, validation_data=validation_df, run_validation=run_validation) logger.debug("Training AE model for user: '%s'... 
Complete.", user_id) - dfp_mm = DFPMessageMeta(cudf.from_pandas(final_df), user_id=user_id) - + meta = MessageMeta(cudf.from_pandas(final_df)) output_message = ControlMessage() - output_message.payload(dfp_mm) + output_message.payload(meta) output_message.set_metadata("user_id", user_id) output_message.set_metadata("model", model) output_message.set_metadata("train_scores_mean", 0.0) diff --git a/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py b/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py index f9233c6f89..7ef67ec88c 100644 --- a/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py +++ b/python/morpheus_dfp/morpheus_dfp/stages/dfp_rolling_window_stage.py @@ -22,11 +22,13 @@ import pandas as pd from mrc.core import operators as ops +import cudf + from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.cached_user_window import CachedUserWindow from morpheus_dfp.utils.logging_timer import log_time @@ -88,7 +90,7 @@ def supports_cpp_node(self): def accepted_types(self) -> typing.Tuple: """Input types accepted by this stage.""" - return (DFPMessageMeta, ) + return (ControlMessage, ) def compute_schema(self, schema: StageSchema): schema.output_schema.set_type(ControlMessage) @@ -115,13 +117,13 @@ def _get_user_cache(self, user_id: str) -> typing.Generator[CachedUserWindow, No # # When it returns, make sure to save # user_cache.save() - def _build_window(self, message: DFPMessageMeta) -> ControlMessage: + def _build_window(self, message: ControlMessage) -> ControlMessage: - user_id = message.user_id + user_id = message.get_metadata('user_id') with self._get_user_cache(user_id) as user_cache: - incoming_df = message.get_data() + incoming_df = 
message.payload().get_data().to_pandas() # existing_df = user_cache.df if (not user_cache.append_dataframe(incoming_df=incoming_df)): @@ -161,12 +163,12 @@ def _build_window(self, message: DFPMessageMeta) -> ControlMessage: # Otherwise return a new message response_msg = ControlMessage() - response_msg.payload(DFPMessageMeta(df=train_df, user_id=user_id)) + response_msg.payload(MessageMeta(df=cudf.DataFrame(train_df))) response_msg.set_metadata("user_id", user_id) return response_msg - def on_data(self, message: DFPMessageMeta) -> ControlMessage: + def on_data(self, message: ControlMessage) -> ControlMessage: """ Emits a new message containing the rolling window for the user if and only if the history requirments are met, returns `None` otherwise. @@ -180,10 +182,10 @@ def on_data(self, message: DFPMessageMeta) -> ControlMessage: log_info.set_log( ("Rolling window complete for %s in {duration:0.2f} ms. " "Input: %s rows from %s to %s. Output: %s rows from %s to %s"), - message.user_id, - len(message.df), - message.df[self._config.ae.timestamp_column_name].min(), - message.df[self._config.ae.timestamp_column_name].max(), + message.get_metadata('user_id'), + len(message.payload().df), + message.payload().df[self._config.ae.timestamp_column_name].min(), + message.payload().df[self._config.ae.timestamp_column_name].max(), result.payload().count, result.payload().get_data(self._config.ae.timestamp_column_name).min(), result.payload().get_data(self._config.ae.timestamp_column_name).max(), diff --git a/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py b/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py index 2a40b4521e..e88b3dfc49 100644 --- a/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py +++ b/python/morpheus_dfp/morpheus_dfp/stages/dfp_split_users_stage.py @@ -14,7 +14,6 @@ """Split messages into individual users and generic messages.""" import logging -import typing import mrc import numpy as np @@ -24,10 +23,11 @@ 
import cudf from morpheus.config import Config +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema from morpheus.utils.type_aliases import DataFrameType -from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.utils.logging_timer import log_time logger = logging.getLogger(f"morpheus.{__name__}") @@ -59,8 +59,8 @@ def __init__(self, c: Config, include_generic: bool, include_individual: bool, - skip_users: typing.List[str] = None, - only_users: typing.List[str] = None): + skip_users: list[str] = None, + only_users: list[str] = None): super().__init__(c) self._include_generic = include_generic @@ -69,25 +69,25 @@ def __init__(self, self._only_users = only_users if only_users is not None else [] # Map of user ids to total number of messages. Keeps indexes monotonic and increasing per user - self._user_index_map: typing.Dict[str, int] = {} + self._user_index_map: dict[str, int] = {} @property def name(self) -> str: """Stage name.""" return "dfp-split-users" - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: """Whether this stage supports a C++ node.""" return False - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """Input types accepted by this stage.""" return (cudf.DataFrame, pd.DataFrame) def compute_schema(self, schema: StageSchema): - schema.output_schema.set_type(DFPMessageMeta) + schema.output_schema.set_type(ControlMessage) - def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: + def extract_users(self, message: DataFrameType) -> list[ControlMessage]: """ Extract users from a message, splitting the incoming data into unique messages on a per-user basis, and potentially filtering data based on the user. 
@@ -101,7 +101,7 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: # Convert to pandas because cudf is slow at this message = message.to_pandas() - split_dataframes: typing.Dict[str, pd.DataFrame] = {} + split_dataframes: dict[str, pd.DataFrame] = {} # If we are skipping users, do that here if (len(self._skip_users) > 0): @@ -123,7 +123,8 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: user_df in message.groupby(self._config.ae.userid_column_name, sort=False) }) - output_messages: typing.List[DFPMessageMeta] = [] + output_messages: list[ControlMessage] = [] + rows_per_user: list[int] = [] for user_id in sorted(split_dataframes.keys()): @@ -138,7 +139,12 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: user_df.index = range(current_user_count, current_user_count + len(user_df)) self._user_index_map[user_id] = current_user_count + len(user_df) - output_messages.append(DFPMessageMeta(df=user_df, user_id=user_id)) + rows_per_user.append(len(user_df)) + meta = MessageMeta(cudf.DataFrame.from_pandas(user_df)) + cm_msg = ControlMessage() + cm_msg.payload(meta) + cm_msg.set_metadata("user_id", user_id) + output_messages.append(cm_msg) # logger.debug("Emitting dataframe for user '%s'. Start: %s, End: %s, Count: %s", # user, @@ -146,8 +152,6 @@ def extract_users(self, message: DataFrameType) -> typing.List[DFPMessageMeta]: # df_user[self._config.ae.timestamp_column_name].max(), # df_user[self._config.ae.timestamp_column_name].count()) - rows_per_user = [len(x.df) for x in output_messages] - if (len(output_messages) > 0): log_info.set_log( ("Batch split users complete. Input: %s rows from %s to %s. 
" diff --git a/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py b/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py index 9e3e2d904c..036e2c90eb 100644 --- a/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py +++ b/python/morpheus_dfp/morpheus_dfp/utils/config_generator.py @@ -17,7 +17,6 @@ from morpheus.cli.utils import get_package_relative_file from morpheus.config import Config from morpheus.config import ConfigAutoEncoder -from morpheus.config import CppConfig from morpheus.messages import ControlMessage from morpheus.utils.file_utils import load_labels_file from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE @@ -172,12 +171,8 @@ def generate_ae_config(source: str, timestamp_column_name: str, pipeline_batch_size: int = 0, edge_buffer_size: int = 0, - use_cpp: bool = False, num_threads: int = len(os.sched_getaffinity(0))): config = Config() - - CppConfig.set_should_use_cpp(use_cpp) - config.num_threads = num_threads if pipeline_batch_size > 0: diff --git a/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp b/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp index 10ead7fa5d..51323b8ef6 100644 --- a/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp +++ b/python/morpheus_llm/morpheus_llm/_lib/llm/module.cpp @@ -33,7 +33,6 @@ #include "morpheus/messages/control.hpp" // IWYU pragma: keep #include "morpheus/pybind11/json.hpp" // IWYU pragma: keep -#include "morpheus/utilities/cudf_util.hpp" #include "morpheus/utilities/json_types.hpp" #include "morpheus/version.hpp" @@ -70,9 +69,6 @@ PYBIND11_MODULE(llm, _module) )pbdoc"; - // Load the cudf helpers - CudfHelper::load(); - // Import the mrc coro module mrc::pymrc::import(_module, "mrc.core.coro"); diff --git a/python/morpheus_llm/morpheus_llm/llm/nodes/extracter_node.py b/python/morpheus_llm/morpheus_llm/llm/nodes/extracter_node.py index 8027ad8178..710b0fe1e2 100644 --- a/python/morpheus_llm/morpheus_llm/llm/nodes/extracter_node.py +++ 
b/python/morpheus_llm/morpheus_llm/llm/nodes/extracter_node.py @@ -17,6 +17,7 @@ import numpy as np +from morpheus.messages import MessageMeta from morpheus_llm.llm import LLMContext from morpheus_llm.llm import LLMNodeBase @@ -59,7 +60,9 @@ async def execute(self, context: LLMContext) -> LLMContext: # pylint: disable=i # Get the keys from the task input_keys: list[str] = typing.cast(list[str], context.task()["input_keys"]) - with context.message().payload().mutable_dataframe() as df: + meta: MessageMeta = context.message().get_metadata("llm_message_meta") + + with meta.mutable_dataframe() as df: input_dict: list[dict] = df[input_keys].to_dict(orient="list") input_dict = _array_to_list(input_dict) @@ -95,7 +98,8 @@ def get_input_names(self) -> list[str]: async def execute(self, context: LLMContext) -> LLMContext: # pylint: disable=invalid-overridden-method # Get the data from the DataFrame - with context.message().payload().mutable_dataframe() as df: + meta: MessageMeta = context.message().get_metadata("llm_message_meta") + with meta.mutable_dataframe() as df: input_dict: list[dict] = df[self._input_names].to_dict(orient="list") input_dict = _array_to_list(input_dict) diff --git a/python/morpheus_llm/morpheus_llm/llm/task_handlers/simple_task_handler.py b/python/morpheus_llm/morpheus_llm/llm/task_handlers/simple_task_handler.py index c2461200ad..baf0db0310 100644 --- a/python/morpheus_llm/morpheus_llm/llm/task_handlers/simple_task_handler.py +++ b/python/morpheus_llm/morpheus_llm/llm/task_handlers/simple_task_handler.py @@ -15,6 +15,7 @@ import logging from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus_llm.llm import LLMContext from morpheus_llm.llm import LLMTaskHandler @@ -48,7 +49,8 @@ async def try_handle(self, context: LLMContext) -> list[ControlMessage]: input_dict = context.get_inputs() - with context.message().payload().mutable_dataframe() as df: + meta: MessageMeta = 
context.message().get_metadata("llm_message_meta") + with meta.mutable_dataframe() as df: # Write the values to the dataframe for key, value in input_dict.items(): df[key] = value diff --git a/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py b/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py index 3c4e07c9ec..c9528f3c78 100644 --- a/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py +++ b/python/morpheus_llm/morpheus_llm/modules/output/write_to_vector_db.py @@ -21,13 +21,13 @@ from mrc.core import operators as ops from pydantic import ValidationError -import cudf - from morpheus.messages import ControlMessage from morpheus.utils.module_ids import MORPHEUS_MODULE_NAMESPACE from morpheus.utils.module_ids import WRITE_TO_VECTOR_DB from morpheus.utils.module_utils import ModuleLoaderFactory from morpheus.utils.module_utils import register_module +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_df_pkg_from_obj from morpheus_llm.modules.schemas.write_to_vector_db_schema import WriteToVDBSchema from morpheus_llm.service.vdb.milvus_client import DATA_TYPE_MAP from morpheus_llm.service.vdb.utils import VectorDBServiceFactory @@ -70,7 +70,7 @@ def preprocess_vdb_resources(service, recreate: bool, resource_schemas: dict): class AccumulationStats: msg_count: int last_insert_time: float - data: list[cudf.DataFrame] + data: list[DataFrameType] @register_module(WRITE_TO_VECTOR_DB, MORPHEUS_MODULE_NAMESPACE) @@ -144,12 +144,12 @@ def on_completed(): for key, accum_stats in accumulator_dict.items(): try: if accum_stats.data: - merged_df = cudf.concat(accum_stats.data) + df_pkg = get_df_pkg_from_obj(accum_stats.data[0]) + merged_df = df_pkg.concat(accum_stats.data) service.insert_dataframe(name=key, df=merged_df) final_df_references.append(accum_stats.data) except Exception as e: logger.error("Unable to upload dataframe entries to vector database: %s", e) - raise finally: # 
Close vector database service connection service.close() @@ -175,9 +175,6 @@ def on_data(msg: ControlMessage): df, msg_resource_target = extract_df(msg) if df is not None and not df.empty: - if (not isinstance(df, cudf.DataFrame)): - df = cudf.DataFrame(df) - df_size = len(df) current_time = time.time() @@ -202,7 +199,8 @@ def on_data(msg: ControlMessage): (current_time - accum_stats.last_insert_time) >= write_time_interval): if accum_stats.data: - merged_df = cudf.concat(accum_stats.data) + df_pkg = get_df_pkg_from_obj(accum_stats.data[0]) + merged_df = df_pkg.concat(accum_stats.data) # pylint: disable=not-a-mapping service.insert_dataframe(name=key, df=merged_df, **resource_kwargs) diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py index 0197f3071d..8a31ed8085 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/faiss_vdb_service.py @@ -17,10 +17,7 @@ import time import typing -import pandas as pd - -import cudf - +from morpheus.utils.type_aliases import DataFrameType from morpheus_llm.error import IMPORT_ERROR_MESSAGE from morpheus_llm.service.vdb.vector_db_service import VectorDBResourceService from morpheus_llm.service.vdb.vector_db_service import VectorDBService @@ -81,13 +78,13 @@ def insert(self, data: list[list] | list[dict], **kwargs) -> dict: """ raise NotImplementedError("Insert operation is not supported in FAISS") - def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs) -> dict: + def insert_dataframe(self, df: DataFrameType, **kwargs) -> dict: """ Insert a dataframe entires into the vector database. Parameters ---------- - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted into the collection. **kwargs Extra keyword arguments specific to the vector database implementation. 
@@ -368,11 +365,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs): """ raise NotImplementedError("create operation is not supported in FAISS") - def create_from_dataframe(self, - name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], - overwrite: bool = False, - **kwargs) -> None: + def create_from_dataframe(self, name: str, df: DataFrameType, overwrite: bool = False, **kwargs) -> None: """ Create collections in the vector database. @@ -380,7 +373,7 @@ def create_from_dataframe(self, ---------- name : str Name of the collection. - df : Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType The dataframe to create the collection from. overwrite : bool, optional Whether to overwrite the collection if it already exists. Default is False. @@ -416,8 +409,7 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs) -> dict[str raise NotImplementedError("create_from_dataframe operation is not supported in FAISS") - def insert_dataframe(self, name: str, df: typing.Union[cudf.DataFrame, pd.DataFrame], - **kwargs) -> dict[str, typing.Any]: + def insert_dataframe(self, name: str, df: DataFrameType, **kwargs) -> dict[str, typing.Any]: """ Converts dataframe to rows and insert to the vector database. @@ -425,7 +417,7 @@ def insert_dataframe(self, name: str, df: typing.Union[cudf.DataFrame, pd.DataFr ---------- name : str Name of the collection to be inserted. - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted in the collection. **kwargs Additional keyword arguments containing collection configuration. 
diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py index 71df614b23..5c3f020aea 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/milvus_vector_db_service.py @@ -20,11 +20,10 @@ import typing from functools import wraps -import cudf - from morpheus.io.utils import cudf_string_cols_exceed_max_bytes from morpheus.io.utils import truncate_string_cols_by_bytes from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import is_cudf_type from morpheus_llm.error import IMPORT_ERROR_MESSAGE from morpheus_llm.service.vdb.vector_db_service import VectorDBResourceService from morpheus_llm.service.vdb.vector_db_service import VectorDBService @@ -327,7 +326,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) - logger.info("Skipped checking 'None' in the field: %s, with datatype: %s", field_name, dtype) needs_truncate = self._truncate_long_strings - if needs_truncate and isinstance(df, cudf.DataFrame): + if needs_truncate and is_cudf_type(df): # Cudf specific optimization, we can avoid a costly call to truncate_string_cols_by_bytes if all of the # string columns are already below the max length needs_truncate = cudf_string_cols_exceed_max_bytes(df, self._fields_max_length) @@ -336,7 +335,7 @@ def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) - column_names = [field.name for field in self._fields if not field.auto_id] collection_df = df[column_names] - if isinstance(collection_df, cudf.DataFrame): + if is_cudf_type(collection_df): collection_df = collection_df.to_pandas() if needs_truncate: @@ -728,7 +727,7 @@ def _build_schema_conf(self, df: DataFrameType) -> list[dict]: # Always add a primary key fields.append({"name": "pk", "dtype": pymilvus.DataType.INT64, "is_primary": True, "auto_id": True}) 
- if isinstance(df, cudf.DataFrame): + if is_cudf_type(df): df = df.to_pandas() # Loop over all of the columns of the first row and build the schema diff --git a/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py b/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py index 8f2d346f55..bbf0439028 100644 --- a/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py +++ b/python/morpheus_llm/morpheus_llm/service/vdb/vector_db_service.py @@ -17,9 +17,7 @@ from abc import ABC from abc import abstractmethod -import pandas as pd - -import cudf +from morpheus.utils.type_aliases import DataFrameType logger = logging.getLogger(__name__) @@ -50,13 +48,13 @@ def insert(self, data: list[list] | list[dict], **kwargs: dict[str, typing.Any]) pass @abstractmethod - def insert_dataframe(self, df: typing.Union[cudf.DataFrame, pd.DataFrame], **kwargs: dict[str, typing.Any]) -> dict: + def insert_dataframe(self, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict: """ Insert a dataframe into the vector database. Parameters ---------- - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted into the resource. **kwargs : dict[str, typing.Any] Extra keyword arguments specific to the vector database implementation. @@ -241,10 +239,7 @@ def insert(self, name: str, data: list[list] | list[dict], **kwargs: dict[str, t pass @abstractmethod - def insert_dataframe(self, - name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], - **kwargs: dict[str, typing.Any]) -> dict: + def insert_dataframe(self, name: str, df: DataFrameType, **kwargs: dict[str, typing.Any]) -> dict: """ Converts dataframe to rows and insert into the vector database resource. @@ -252,7 +247,7 @@ def insert_dataframe(self, ---------- name : str Name of the resource to be inserted. - df : typing.Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType Dataframe to be inserted. 
**kwargs : dict[str, typing.Any] Additional keyword arguments containing collection configuration. @@ -391,7 +386,7 @@ def create(self, name: str, overwrite: bool = False, **kwargs: dict[str, typing. @abstractmethod def create_from_dataframe(self, name: str, - df: typing.Union[cudf.DataFrame, pd.DataFrame], + df: DataFrameType, overwrite: bool = False, **kwargs: dict[str, typing.Any]) -> None: """ @@ -401,7 +396,7 @@ def create_from_dataframe(self, ---------- name : str Name of the resource. - df : Union[cudf.DataFrame, pd.DataFrame] + df : DataFrameType The dataframe to create the resource from. overwrite : bool, optional Whether to overwrite the resource if it already exists. Default is False. diff --git a/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py b/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py index 86c0717964..289e447afa 100644 --- a/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py +++ b/python/morpheus_llm/morpheus_llm/stages/llm/llm_engine_stage.py @@ -12,14 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +import functools import logging +import types import typing import mrc +from mrc.core import operators as ops -import morpheus_llm._lib.llm as _llm from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus_llm.llm import LLMEngine @@ -27,7 +31,7 @@ logger = logging.getLogger(__name__) -class LLMEngineStage(PassThruTypeMixin, SinglePortStage): +class LLMEngineStage(PassThruTypeMixin, GpuAndCpuMixin, SinglePortStage): """ Stage for executing an LLM engine within a Morpheus pipeline. 
@@ -49,27 +53,95 @@ def name(self) -> str: """Return the name of the stage""" return "llm-engine" - def accepted_types(self) -> typing.Tuple: + def accepted_types(self) -> tuple: """ Returns accepted input types for this stage. Returns ------- - typing.Tuple(`ControlMessage`, ) + tuple(`ControlMessage`, ) Accepted input types. """ return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: """Indicates whether this stage supports a C++ node.""" return True + def _store_payload(self, message: ControlMessage) -> ControlMessage: + """ + Store the MessageMeta in the ControlMessage's metadata. + + In CPU-only allows the ControlMessage to hold an instance of a Python MessageMeta containing a pandas DataFrame. + """ + message.set_metadata("llm_message_meta", message.payload()) + return message + + def _copy_tasks_and_metadata(self, + src: ControlMessage, + dst: ControlMessage, + metadata: dict[str, typing.Any] = None): + if metadata is None: + metadata = src.get_metadata() + + for (key, value) in metadata.items(): + dst.set_metadata(key, value) + + tasks = src.get_tasks() + for (task, task_value) in tasks.items(): + for tv in task_value: + dst.add_task(task, tv) + + def _cast_to_cpp_control_message(self, py_message: ControlMessage, *, + cpp_messages_lib: types.ModuleType) -> ControlMessage: + """ + LLMEngineStage does not contain a Python implementation, however it is capable of running in cpu-only mode. + This method is needed to create an instance of a C++ ControlMessage. + + This is different than casting from the Python bindings for the C++ ControlMessage to a C++ ControlMessage. + """ + cpp_message = cpp_messages_lib.ControlMessage() + self._copy_tasks_and_metadata(py_message, cpp_message) + + return cpp_message + + def _restore_payload(self, message: ControlMessage) -> ControlMessage: + """ + Pop llm_message_meta from the metadata and set it as the payload. 
+ + In CPU-only mode this has the effect of converting the C++ ControlMessage back to a Python ControlMessage. + """ + metadata = message.get_metadata() + message_meta = metadata.pop("llm_message_meta") + + out_message = ControlMessage() + out_message.payload(message_meta) + + self._copy_tasks_and_metadata(message, out_message, metadata=metadata) + + return out_message + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + import morpheus_llm._lib.llm as _llm + + store_payload_node = builder.make_node(f"{self.unique_name}-store-payload", ops.map(self._store_payload)) + builder.make_edge(input_node, store_payload_node) node = _llm.LLMEngineStage(builder, self.unique_name, self._engine) node.launch_options.pe_count = 1 - builder.make_edge(input_node, node) + if self._config.execution_mode == ExecutionMode.CPU: + import morpheus._lib.messages as _messages + cast_to_cpp_fn = functools.partial(self._cast_to_cpp_control_message, cpp_messages_lib=_messages) + cast_to_cpp_node = builder.make_node(f"{self.unique_name}-pre-msg-cast", ops.map(cast_to_cpp_fn)) + builder.make_edge(store_payload_node, cast_to_cpp_node) + builder.make_edge(cast_to_cpp_node, node) + + else: + builder.make_edge(store_payload_node, node) + + restore_payload_node = builder.make_node(f"{self.unique_name}-restore-payload", ops.map(self._restore_payload)) + builder.make_edge(node, restore_payload_node) - return node + return restore_payload_node diff --git a/tests/_utils/dataset_manager.py b/tests/_utils/dataset_manager.py index c6aeb09892..40202b4025 100644 --- a/tests/_utils/dataset_manager.py +++ b/tests/_utils/dataset_manager.py @@ -29,7 +29,9 @@ from _utils import assert_results from morpheus.io.deserializers import read_file_to_df from morpheus.utils import compare_df +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import SeriesType class DatasetManager: 
@@ -38,19 +40,19 @@ class DatasetManager: Parameters ---------- - df_type : typing.Literal['cudf', 'pandas'] + df_type : DataFrameTypeStr Type of DataFrame to return unless otherwise explicitly specified. """ - __df_cache: typing.Dict[typing.Tuple[typing.Literal['cudf', 'pandas'], str], DataFrameType] = {} + __df_cache: dict[tuple[DataFrameModule, str], DataFrameType] = {} # Values in `__instances` are instances of `DatasetLoader` - __instances: typing.Dict[typing.Literal['cudf', 'pandas'], typing.Any] = {} + __instances: dict[DataFrameModule, "DatasetManager"] = {} # Explicitly using __new__ instead of of an __init__ to implement this as a singleton for each dataframe type. # Initialization is also being performed here instead of an __init__ method as an __init__ method would be re-run # the __init__ on the singleton instance for each cache hit. - def __new__(cls, df_type: typing.Literal['cudf', 'pandas']): + def __new__(cls, df_type: DataFrameModule): """Returns the singleton instance of `DatasetManager` for the specified `df_type`.""" try: return cls.__instances[df_type] @@ -61,7 +63,7 @@ def __new__(cls, df_type: typing.Literal['cudf', 'pandas']): return instance @staticmethod - def get_alt_df_type(df_type: typing.Literal['cudf', 'pandas']) -> typing.Literal['cudf', 'pandas']: + def get_alt_df_type(df_type: DataFrameModule) -> DataFrameModule: """Returns the other possible df type.""" return 'cudf' if df_type == 'pandas' else 'pandas' @@ -71,7 +73,7 @@ def clear(self): def get_df(self, file_path: str, - df_type: typing.Literal['cudf', 'pandas'] = None, + df_type: DataFrameModule = None, no_cache: bool = False, **reader_kwargs) -> DataFrameType: """ @@ -123,9 +125,7 @@ def get_df(self, return df.copy(deep=True) - def __getitem__( - self, item: typing.Union[str, typing.Tuple[str], typing.Tuple[str, typing.Literal['cudf', - 'pandas']]]) -> DataFrameType: + def __getitem__(self, item: str | tuple[str] | tuple[str, DataFrameModule]) -> DataFrameType: """Implements 
`__getitem__` to allow for fetching DataFrames using the `[]` operator.""" if not isinstance(item, tuple): item = (item, ) @@ -172,7 +172,7 @@ def repeat(df: DataFrameType, repeat_count: int = 2, reset_index: bool = True) - return repeated_df @staticmethod - def replace_index(df: DataFrameType, replace_ids: typing.Dict[int, int]) -> DataFrameType: + def replace_index(df: DataFrameType, replace_ids: dict[int, int]) -> DataFrameType: """Return a new DataFrame's where we replace some index values with others.""" return df.rename(index=replace_ids) @@ -192,7 +192,7 @@ def dup_index(cls, df: DataFrameType, count: int = 1) -> DataFrameType: return cls.replace_index(df, replace_dict) @staticmethod - def _value_as_pandas(val: typing.Union[pd.DataFrame, cdf.DataFrame, cdf.Series], assert_is_pandas=True): + def _value_as_pandas(val: DataFrameType | SeriesType, assert_is_pandas=True): if (isinstance(val, (cdf.DataFrame, cdf.Series))): return val.to_pandas() @@ -202,7 +202,15 @@ def _value_as_pandas(val: typing.Union[pd.DataFrame, cdf.DataFrame, cdf.Series], return val @classmethod - def df_equal(cls, df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], val_to_check: typing.Any): + def _value_as_pandas_df(cls, val: DataFrameType | SeriesType, assert_is_pandas=True): + pval = cls._value_as_pandas(val, assert_is_pandas=assert_is_pandas) + if isinstance(pval, pd.Series): + pval = pval.to_frame() + + return pval + + @classmethod + def df_equal(cls, df_to_check: DataFrameType, val_to_check: typing.Any): """ Compare a DataFrame against a validation dataset which can either be a DataFrame, Series or CuPy array. Returns True if they are equal. 
@@ -224,7 +232,7 @@ def df_equal(cls, df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], val_to @classmethod def assert_df_equal(cls, - df_to_check: typing.Union[pd.DataFrame, cdf.DataFrame], + df_to_check: DataFrameType, val_to_check: typing.Any, assert_msg="Dataframes are not equal."): """ @@ -234,20 +242,14 @@ def assert_df_equal(cls, assert cls.df_equal(df_to_check=df_to_check, val_to_check=val_to_check), assert_msg @classmethod - def compare_df(cls, - dfa: typing.Union[pd.DataFrame, cdf.DataFrame], - dfb: typing.Union[pd.DataFrame, cdf.DataFrame], - **compare_args): + def compare_df(cls, dfa: DataFrameType, dfb: DataFrameType, **compare_args): """Wrapper for `morpheus.utils.compare_df.compare_df`.""" with warnings.catch_warnings(): # Ignore performance warnings from pandas triggered by the comparison warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) - return compare_df.compare_df(cls._value_as_pandas(dfa), cls._value_as_pandas(dfb), **compare_args) + return compare_df.compare_df(cls._value_as_pandas_df(dfa), cls._value_as_pandas_df(dfb), **compare_args) @classmethod - def assert_compare_df(cls, - dfa: typing.Union[pd.DataFrame, cdf.DataFrame], - dfb: typing.Union[pd.DataFrame, cdf.DataFrame], - **compare_args): + def assert_compare_df(cls, dfa: DataFrameType, dfb: DataFrameType, **compare_args): """Convenience method for calling `compare_df` and asserting that the results are equivalent.""" assert_results(cls.compare_df(dfa, dfb, **compare_args)) diff --git a/tests/_utils/inference_worker.py b/tests/_utils/inference_worker.py index 7470e474d5..29af5a0c02 100644 --- a/tests/_utils/inference_worker.py +++ b/tests/_utils/inference_worker.py @@ -26,8 +26,6 @@ class IW(inference_stage.InferenceWorker): """ def calc_output_dims(self, _): - # Intentionally calling the abc empty method for coverage - super().calc_output_dims(_) return (1, 2) def process(self, _: ControlMessage, __: typing.Callable[[TensorMemory], None]): diff --git 
a/tests/_utils/stages/check_pre_alloc.py b/tests/_utils/stages/check_pre_alloc.py index 0f871a78dd..c8217cc28b 100644 --- a/tests/_utils/stages/check_pre_alloc.py +++ b/tests/_utils/stages/check_pre_alloc.py @@ -21,11 +21,12 @@ from morpheus.common import typeid_to_numpy_str from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class CheckPreAlloc(PassThruTypeMixin, SinglePortStage): +class CheckPreAlloc(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Acts like add-class/add-scores in that it requests a preallocation, the node will assert that the preallocation occurred with the correct type. @@ -38,16 +39,16 @@ def __init__(self, c, probs_type): self._needed_columns.update({label: probs_type for label in c.class_labels}) @property - def name(self): + def name(self) -> str: return "check-prealloc" - def accepted_types(self): + def accepted_types(self) -> tuple: return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def _check_prealloc(self, msg: ControlMessage): + def _check_prealloc(self, msg: ControlMessage) -> ControlMessage: df = msg.payload().df for label in self._class_labels: assert label in df.columns diff --git a/tests/_utils/stages/control_message_pass_thru.py b/tests/_utils/stages/control_message_pass_thru.py index 659606d38c..cd3ba74e18 100644 --- a/tests/_utils/stages/control_message_pass_thru.py +++ b/tests/_utils/stages/control_message_pass_thru.py @@ -18,23 +18,24 @@ from mrc.core import operators as ops from morpheus.messages import ControlMessage +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class 
ControlMessagePassThruStage(PassThruTypeMixin, SinglePortStage): +class ControlMessagePassThruStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): @property def name(self) -> str: return "mm-pass-thru" - def accepted_types(self): + def accepted_types(self) -> tuple: return (ControlMessage, ) - def supports_cpp_node(self): + def supports_cpp_node(self) -> bool: return False - def on_data(self, message: ControlMessage): + def on_data(self, message: ControlMessage) -> ControlMessage: return message def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: diff --git a/tests/_utils/stages/conv_msg.py b/tests/_utils/stages/conv_msg.py index edd64f5384..637963e50a 100755 --- a/tests/_utils/stages/conv_msg.py +++ b/tests/_utils/stages/conv_msg.py @@ -15,23 +15,24 @@ import typing -import cupy as cp import mrc -import pandas as pd from mrc.core import operators as ops -import cudf - -import morpheus._lib.messages as _messages from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_utils import get_array_pkg +from morpheus.utils.type_utils import get_df_pkg +from morpheus.utils.type_utils import get_df_pkg_from_obj @register_stage("unittest-conv-msg", ignore_args=["expected_data"]) -class ConvMsg(SinglePortStage): +class ConvMsg(GpuAndCpuMixin, SinglePortStage): """ Simple test stage to convert a ControlMessage to a ControlMessage with probs tensor. Basically a cheap replacement for running an inference stage. 
@@ -45,17 +46,20 @@ class ConvMsg(SinglePortStage): def __init__(self, c: Config, - expected_data: typing.Union[pd.DataFrame, cudf.DataFrame] = None, + expected_data: DataFrameType = None, columns: typing.List[str] = None, order: str = 'K', probs_type: str = 'f4', empty_probs: bool = False): super().__init__(c) + self._df_pkg = get_df_pkg(c.execution_mode) + self._array_pkg = get_array_pkg(c.execution_mode) + if expected_data is not None: - assert isinstance(expected_data, (pd.DataFrame, cudf.DataFrame)) + assert isinstance(expected_data, self._df_pkg.DataFrame) - self._expected_data = expected_data + self._expected_data: DataFrameType | None = expected_data self._columns = columns self._order = order self._probs_type = probs_type @@ -76,20 +80,21 @@ def supports_cpp_node(self) -> bool: def _conv_message(self, message: ControlMessage) -> ControlMessage: if self._expected_data is not None: - if (isinstance(self._expected_data, cudf.DataFrame)): + df_pkg = get_df_pkg_from_obj(self._expected_data) + if (isinstance(self._expected_data, self._df_pkg.DataFrame)): df = self._expected_data.copy(deep=True) else: - df = cudf.DataFrame(self._expected_data) + df = df_pkg.DataFrame(self._expected_data) else: - df: cudf.DataFrame = message.payload().get_data(self._columns) # type: ignore + df: DataFrameType = message.payload().get_data(self._columns) # type: ignore if self._empty_probs: - probs = cp.zeros([len(df), 3], 'float') + probs = self._array_pkg.zeros([len(df), 3], 'float') else: - probs = cp.array(df.values, dtype=self._probs_type, copy=True, order=self._order) + probs = self._array_pkg.array(df.values, dtype=self._probs_type, copy=True, order=self._order) - message.tensors(_messages.TensorMemory(count=len(probs), tensors={'probs': probs})) + message.tensors(TensorMemory(count=len(probs), tensors={'probs': probs})) return message def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: diff --git 
a/tests/_utils/stages/dfp_length_checker.py b/tests/_utils/stages/dfp_length_checker.py index 1162a647f7..659c8c81ec 100755 --- a/tests/_utils/stages/dfp_length_checker.py +++ b/tests/_utils/stages/dfp_length_checker.py @@ -21,13 +21,14 @@ from morpheus.cli.register_stage import register_stage from morpheus.config import Config from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.atomic_integer import AtomicInteger @register_stage("unittest-dfp-length-check") -class DFPLengthChecker(PassThruTypeMixin, SinglePortStage): +class DFPLengthChecker(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Verifies that the incoming MessageMeta classes are of a specific length diff --git a/tests/_utils/stages/error_raiser.py b/tests/_utils/stages/error_raiser.py index 8923229ab2..f3e0d8b5e6 100644 --- a/tests/_utils/stages/error_raiser.py +++ b/tests/_utils/stages/error_raiser.py @@ -19,12 +19,13 @@ from mrc.core import operators as ops from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage from morpheus.utils.atomic_integer import AtomicInteger -class ErrorRaiserStage(PassThruTypeMixin, SinglePortStage): +class ErrorRaiserStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Stage that raises an exception in the on_data method """ diff --git a/tests/_utils/stages/in_memory_multi_source_stage.py b/tests/_utils/stages/in_memory_multi_source_stage.py index 1eb2d46092..7497aff7ed 100644 --- a/tests/_utils/stages/in_memory_multi_source_stage.py +++ b/tests/_utils/stages/in_memory_multi_source_stage.py @@ -18,11 +18,12 @@ import mrc from morpheus.config import Config +from 
morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.source_stage import SourceStage from morpheus.pipeline.stage_schema import StageSchema -class InMemoryMultiSourceStage(SourceStage): +class InMemoryMultiSourceStage(GpuAndCpuMixin, SourceStage): """ In memory multi-source stage for testing purposes, accepts a 2d array `data`. The first dimenion represents the number of output ports, and the second represents the data for each port, and diff --git a/tests/_utils/stages/in_memory_source_x_stage.py b/tests/_utils/stages/in_memory_source_x_stage.py index bd1256c07a..229e8e1a3a 100644 --- a/tests/_utils/stages/in_memory_source_x_stage.py +++ b/tests/_utils/stages/in_memory_source_x_stage.py @@ -18,11 +18,12 @@ import mrc from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.single_output_source import SingleOutputSource from morpheus.pipeline.stage_schema import StageSchema -class InMemSourceXStage(SingleOutputSource): +class InMemSourceXStage(GpuAndCpuMixin, SingleOutputSource): """ InMemorySourceStage subclass that emits whatever you give it and doesn't assume the source data is a dataframe. 
diff --git a/tests/_utils/stages/multi_port_pass_thru.py b/tests/_utils/stages/multi_port_pass_thru.py index 5454974870..5cffb47b2b 100644 --- a/tests/_utils/stages/multi_port_pass_thru.py +++ b/tests/_utils/stages/multi_port_pass_thru.py @@ -20,11 +20,12 @@ import mrc.core.operators as ops from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.stage import Stage -class MultiPortPassThruStage(PassThruTypeMixin, Stage): +class MultiPortPassThruStage(GpuAndCpuMixin, PassThruTypeMixin, Stage): def __init__(self, c: Config, num_ports: int): super().__init__(c) diff --git a/tests/_utils/stages/record_thread_id_stage.py b/tests/_utils/stages/record_thread_id_stage.py index d2d9a12a82..0a991c1706 100644 --- a/tests/_utils/stages/record_thread_id_stage.py +++ b/tests/_utils/stages/record_thread_id_stage.py @@ -19,11 +19,12 @@ import mrc from morpheus.config import Config +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage -class RecordThreadIdStage(PassThruTypeMixin, SinglePortStage): +class RecordThreadIdStage(GpuAndCpuMixin, PassThruTypeMixin, SinglePortStage): """ Forwarding stage that records the thread id of the progress engine """ diff --git a/tests/_utils/stages/split_stage.py b/tests/_utils/stages/split_stage.py index 4e816de6c0..c03db636fa 100644 --- a/tests/_utils/stages/split_stage.py +++ b/tests/_utils/stages/split_stage.py @@ -20,11 +20,12 @@ from morpheus.config import Config from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin from morpheus.pipeline.stage import Stage from morpheus.pipeline.stage_schema import StageSchema -class SplitStage(Stage): +class SplitStage(GpuAndCpuMixin, Stage): def __init__(self, c: Config): 
super().__init__(c) diff --git a/tests/benchmarks/test_bench_agents_simple_pipeline.py b/tests/benchmarks/test_bench_agents_simple_pipeline.py index cbd83e3cae..ffad11dc78 100644 --- a/tests/benchmarks/test_bench_agents_simple_pipeline.py +++ b/tests/benchmarks/test_bench_agents_simple_pipeline.py @@ -97,7 +97,7 @@ def _run_pipeline(config: Config, source_dfs: list[cudf.DataFrame], model_name: @pytest.mark.usefixtures("openai", "restore_environ") -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.benchmark @mock.patch("langchain.utilities.serpapi.SerpAPIWrapper.aresults") @mock.patch("langchain.OpenAI._agenerate", autospec=True) # autospec is needed as langchain will inspect the function diff --git a/tests/benchmarks/test_bench_completion_pipeline.py b/tests/benchmarks/test_bench_completion_pipeline.py index 20f921d228..c45f3ecd9c 100644 --- a/tests/benchmarks/test_bench_completion_pipeline.py +++ b/tests/benchmarks/test_bench_completion_pipeline.py @@ -74,7 +74,7 @@ def _run_pipeline(config: Config, @pytest.mark.use_cudf -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.benchmark @pytest.mark.usefixtures("mock_nemollm", "mock_chat_completion") @pytest.mark.parametrize("llm_service_cls", [NeMoLLMService, OpenAIChatService]) diff --git a/tests/benchmarks/test_bench_rag_standalone_pipeline.py b/tests/benchmarks/test_bench_rag_standalone_pipeline.py index 8f531326a8..e394eaa331 100644 --- a/tests/benchmarks/test_bench_rag_standalone_pipeline.py +++ b/tests/benchmarks/test_bench_rag_standalone_pipeline.py @@ -121,7 +121,7 @@ def _run_pipeline(config: Config, @pytest.mark.milvus -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.use_cudf @pytest.mark.benchmark @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/common/utils.py')) diff --git a/tests/benchmarks/test_bench_vdb_upload_pipeline.py b/tests/benchmarks/test_bench_vdb_upload_pipeline.py index f7864fb779..51ae9842a1 100644 --- 
a/tests/benchmarks/test_bench_vdb_upload_pipeline.py +++ b/tests/benchmarks/test_bench_vdb_upload_pipeline.py @@ -87,7 +87,7 @@ def _run_pipeline(config: Config, @pytest.mark.milvus -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.use_pandas @pytest.mark.benchmark @pytest.mark.import_mod([ diff --git a/tests/conftest.py b/tests/conftest.py index 55c3b03605..3dca6bc243 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,6 +37,9 @@ from _utils.kafka import kafka_consumer_fixture # noqa: F401 pylint:disable=unused-import from _utils.kafka import kafka_topics_fixture # noqa: F401 pylint:disable=unused-import +if typing.TYPE_CHECKING: + from morpheus.config import ExecutionMode + # Don't let pylint complain about pytest fixtures # pylint: disable=redefined-outer-name,unused-argument @@ -108,32 +111,11 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): supports """ - # === use_cpp Parameterize === - use_cpp = metafunc.definition.get_closest_marker("use_cpp") is not None - use_python = metafunc.definition.get_closest_marker("use_python") is not None - - use_cpp_param = pytest.param(True, marks=pytest.mark.use_cpp(added_by="generate_tests"), id="use_cpp") - use_python_param = pytest.param(False, marks=pytest.mark.use_python(added_by="generate_tests"), id="use_python") - - _set_use_cpp_params = [] - - if ("use_cpp" in metafunc.fixturenames): - # Need to add some params since the fixture was requested - - # Add cpp unless use_cpp == True and use_python == False - if not (use_python and not use_cpp): - _set_use_cpp_params.append(use_cpp_param) - - # Add python unless use_cpp == False and use_python == True - if not (not use_python and use_cpp): - _set_use_cpp_params.append(use_python_param) - - elif (use_cpp and use_python): - # Need to parameterize since we have multiple - _set_use_cpp_params.extend([use_cpp_param, use_python_param]) - - if (len(_set_use_cpp_params) > 0): - metafunc.parametrize("_set_use_cpp", _set_use_cpp_params, indirect=True) + 
# A test can request a fixture by placing it in the function arguments, or with a mark + if ("gpu_and_cpu_mode" in metafunc.fixturenames or metafunc.definition.get_closest_marker("gpu_and_cpu_mode")): + gpu_mode_param = pytest.param(True, marks=pytest.mark.gpu_mode(added_by="generate_tests"), id="gpu_mode") + cpu_mode_param = pytest.param(False, marks=pytest.mark.cpu_mode(added_by="generate_tests"), id="cpu_mode") + metafunc.parametrize("execution_mode", [gpu_mode_param, cpu_mode_param], indirect=True) # === df_type Parameterize === if ("df_type" in metafunc.fixturenames): @@ -172,24 +154,23 @@ def pytest_runtest_setup(item): def pytest_collection_modifyitems(session: pytest.Session, config: pytest.Config, items: typing.List[pytest.Item]): """ - To support old unittest style tests, try to determine the mark from the name + Remove tests that are incompatible with the current configuration. """ if config.getoption("--run_kafka") and not PYTEST_KAFKA_AVAIL: raise RuntimeError(f"--run_kafka requested but pytest_kafka not available due to: {PYTEST_KAFKA_ERROR}") - for item in items: - if "no_cpp" in item.nodeid and item.get_closest_marker("use_python") is None: - item.add_marker(pytest.mark.use_python(added_in="collection_modifyitems")) - elif "cpp" in item.nodeid and item.get_closest_marker("use_cpp") is None: - item.add_marker(pytest.mark.use_cpp(added_in="collection_modifyitems")) - def should_filter_test(item: pytest.Item): - use_cpp = item.get_closest_marker("use_cpp") + gpu_mode = item.get_closest_marker("gpu_mode") use_pandas = item.get_closest_marker("use_pandas") + use_cudf = item.get_closest_marker("use_cudf") + cpu_mode = item.get_closest_marker("cpu_mode") + + if (gpu_mode and use_pandas): + return False - if (use_cpp and use_pandas): + if (use_cudf and cpu_mode): return False return True @@ -212,113 +193,96 @@ def pytest_runtest_teardown(item, nextitem): reset_logging(logger_name=None) # Reset the root logger as well -# This fixture will be used by all 
tests. -@pytest.fixture(scope="function", autouse=True) -def _set_use_cpp(request: pytest.FixtureRequest): +@pytest.fixture(scope="function") +def df_type(request: pytest.FixtureRequest): - do_use_cpp: bool = True + df_type_str: typing.Literal["cudf", "pandas"] # Check for the param if this was indirectly set - if (hasattr(request, "param") and isinstance(request.param, bool)): - do_use_cpp = request.param + if (hasattr(request, "param")): + assert request.param in ["pandas", "cudf"], "Invalid parameter for df_type" + + df_type_str = request.param else: # If not, check for the marker and use that - use_cpp = request.node.get_closest_marker("use_cpp") is not None - use_python = request.node.get_closest_marker("use_python") is not None + use_pandas = request.node.get_closest_marker("use_pandas") is not None + use_cudf = request.node.get_closest_marker("use_cudf") is not None - if (use_cpp and use_python): - raise RuntimeError(f"Both markers (use_cpp and use_python) were added to function {request.node.nodeid}. " + if (use_pandas and use_cudf): + raise RuntimeError(f"Both markers (use_pandas and use_cudf) were added to function {request.node.nodeid}. " "Remove markers to support both.") - # This will default to True or follow use_cpp - do_use_cpp = not use_python - - from morpheus.config import CppConfig + # This will default to "cudf" or follow use_pandas + df_type_str = "cudf" if not use_pandas else "pandas" - CppConfig.set_should_use_cpp(do_use_cpp) + yield df_type_str - yield do_use_cpp +def _get_execution_mode(request: pytest.FixtureRequest) -> "ExecutionMode": + do_gpu_mode: bool = True -# This fixture will be used by all tests. 
-@pytest.fixture(scope="function") -def use_cpp(_set_use_cpp: bool): + # Check for the param if this was indirectly set + if (hasattr(request, "param") and isinstance(request.param, bool)): + do_gpu_mode = request.param + else: + # If not, check for the marker and use that + gpu_mode = request.node.get_closest_marker("gpu_mode") is not None + cpu_mode = request.node.get_closest_marker("cpu_mode") is not None - # Just return the set value - yield _set_use_cpp + if (gpu_mode and cpu_mode): + raise RuntimeError(f"Both markers (gpu_mode and cpu_mode) were added to function {request.node.nodeid}. " + "Use the gpu_and_cpu_mode marker to test both.") + # if both are undefined, infer based on the df_type + if (not gpu_mode and not cpu_mode): + cpu_mode = request.node.get_closest_marker("use_pandas") is not None -@pytest.fixture(scope="function") -def config_only_cpp(): - """ - Use this fixture in unittest style tests to indicate a lack of support for C++. Use via - `@pytest.mark.usefixtures("config_only_cpp")` - """ + # This will default to True or follow gpu_mode + do_gpu_mode = not cpu_mode - from morpheus.config import Config - from morpheus.config import CppConfig + from morpheus.config import ExecutionMode + if do_gpu_mode: + return ExecutionMode.GPU - CppConfig.set_should_use_cpp(True) + return ExecutionMode.CPU - yield Config() +@pytest.fixture(name="execution_mode", scope="function", autouse=True) +def execution_mode_fixture(request: pytest.FixtureRequest): + exec_mode = _get_execution_mode(request) + yield exec_mode -@pytest.fixture(scope="function") -def config_no_cpp(): - """ - Use this fixture in unittest style tests to indicate support for C++. Use via - `@pytest.mark.usefixtures("config_no_cpp")` - """ - from morpheus.config import Config +# This fixture will be used by all tests. 
+@pytest.fixture(scope="function", autouse=True) +def _set_use_cpp(request: pytest.FixtureRequest): + execution_mode = _get_execution_mode(request) from morpheus.config import CppConfig - CppConfig.set_should_use_cpp(False) - - yield Config() - - -@pytest.fixture(scope="function") -def df_type(request: pytest.FixtureRequest): - - df_type_str: typing.Literal["cudf", "pandas"] - - # Check for the param if this was indirectly set - if (hasattr(request, "param")): - assert request.param in ["pandas", "cudf"], "Invalid parameter for df_type" - - df_type_str = request.param - else: - # If not, check for the marker and use that - use_pandas = request.node.get_closest_marker("use_pandas") is not None - use_cudf = request.node.get_closest_marker("use_cudf") is not None - - if (use_pandas and use_cudf): - raise RuntimeError(f"Both markers (use_cpp and use_python) were added to function {request.node.nodeid}. " - "Remove markers to support both.") - - # This will default to "cudf" or follow use_pandas - df_type_str = "cudf" if not use_pandas else "pandas" + do_use_cpp: bool = (execution_mode.value == "GPU") + CppConfig.set_should_use_cpp(do_use_cpp) - yield df_type_str + yield do_use_cpp @pytest.fixture(scope="function") -def config(use_cpp: bool): +def config(execution_mode: "ExecutionMode"): """ For new pytest style tests, get the config by using this fixture. It will setup the config based on the marks set on the object. If no marks are added to the test, it will be parameterized for both C++ and python. For example, ``` - @pytest.mark.use_python + @pytest.mark.cpu_mode def my_python_test(config: Config): ... ``` """ from morpheus.config import Config + config = Config() + config.execution_mode = execution_mode - yield Config() + yield config @pytest.fixture(scope="function") @@ -902,33 +866,11 @@ def test_something(dataset: DatasetManager): ``` A test that requests this fixture will parameterize on the type of DataFrame returned by the DatasetManager. 
- If a test requests both this fixture and the `use_cpp` fixture, or indirectly via the `config` fixture, then - the test will parameterize over both df_type:[cudf, pandas] and use_cpp[True, False]. However it will remove the - df_type=pandas & use_cpp=True combinations as this will cause an unsupported usage of Pandas dataframes with the - C++ implementation of message classes. + If a test requests both this fixture and is marked either `gpu_mode` or `cpu_mode` then only cudf or pandas will be + used to prevent an unsupported usage of Pandas dataframes with the C++ implementation of message classes, and cuDF + with CPU-only implementations. - This behavior can also be overridden by using the `use_cudf`, `use_pandas`, `use_cpp` or `use_pandas` marks ex: - ``` - # This test will only run once with C++ enabled and cudf dataframes - @pytest.mark.use_cpp - def test something(dataset: DatasetManager): - ... - # This test will run once for each dataframe type, with C++ disabled both times - @pytest.mark.use_python - import sysdf dataframes both times - @pytest.mark.use_cudf - def test something(use_cpp: bool, dataset: DatasetManager): - ... - # This test will run only once - @pytest.mark.use_cudf - @pytest.mark.use_python - def test something(dataset: DatasetManager): - ... - # This test creates an incompatible combination and will raise a RuntimeError without being executed - @pytest.mark.use_pandas - @pytest.mark.use_cpp - def test something(dataset: DatasetManager): - ``` + Similarly the `use_cudf`, `use_pandas` marks will also prevent parameterization over the DataFrame type. Users who don't want to parametarize over the DataFrame should use the `dataset_pandas` or `dataset_cudf` fixtures.
""" @@ -948,7 +890,7 @@ def dataset_pandas(): In addition to this, users can use this fixture to explicitly request a cudf Dataframe as well, allowing for a test that looks like: ``` - @pytest.mark.use_cpp + @pytest.mark.gpu_mode def test_something(dataset_pandas: DatasetManager): input_df = dataset_pandas.cudf["filter_probs.csv"] # Feed our source stage a cudf DF @@ -976,12 +918,12 @@ def test_something(dataset_cudf: DatasetManager): @pytest.fixture(scope="function") -def filter_probs_df(dataset, use_cpp: bool): +def filter_probs_df(dataset): """ Shortcut fixture for loading the filter_probs.csv dataset. Unless your test uses the `use_pandas` or `use_cudf` marks this fixture will parametarize over the two dataframe - types. Similarly unless your test uses the `use_cpp` or `use_python` marks this fixture will also parametarize over + types. Similarly unless your test uses the `gpu_mode` or `cpu_mode` marks this fixture will also parameterize over that as well, while excluding the combination of C++ execution and Pandas dataframes.
""" yield dataset["filter_probs.csv"] @@ -1179,6 +1121,18 @@ def mock_nemollm_fixture(): yield mock_nemollm +@pytest.fixture(name="array_pkg") +def array_pkg_fixture(execution_mode: "ExecutionMode") -> types.ModuleType: + from morpheus.utils.type_utils import get_array_pkg + return get_array_pkg(execution_mode) + + +@pytest.fixture(name="df_pkg") +def df_pkg_fixture(execution_mode: "ExecutionMode") -> types.ModuleType: + from morpheus.utils.type_utils import get_df_pkg + return get_df_pkg(execution_mode) + + @pytest.fixture(name="mock_subscription") def mock_subscription_fixture(): """ diff --git a/tests/examples/developer_guide/test_pass_thru.py b/tests/examples/developer_guide/test_pass_thru.py index f98451f318..426b30eae2 100644 --- a/tests/examples/developer_guide/test_pass_thru.py +++ b/tests/examples/developer_guide/test_pass_thru.py @@ -19,40 +19,52 @@ import pytest from _utils import TEST_DIRS +from _utils import assert_results from morpheus.config import Config -from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta +from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage +from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage +from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage from morpheus.utils.type_aliases import DataFrameType -def _check_pass_thru(config: Config, - filter_probs_df: DataFrameType, - pass_thru_stage_cls: SinglePortStage, - on_data_fn_name: str = 'on_data'): - stage = pass_thru_stage_cls(config) - assert isinstance(stage, SinglePortStage) +def _check_pass_thru(config: Config, filter_probs_df: DataFrameType, pass_thru_stage_cls: SinglePortStage): + pass_thru_stage = pass_thru_stage_cls(config) + assert isinstance(pass_thru_stage, SinglePortStage) - meta = MessageMeta(filter_probs_df) - msg = ControlMessage() - 
msg.payload(meta) + pipe = LinearPipeline(config) + pipe.set_source(InMemorySourceStage(config, dataframes=[filter_probs_df.copy(deep=True)])) + sink_1 = pipe.add_stage(InMemorySinkStage(config)) + pipe.add_stage(pass_thru_stage) + sink_2 = pipe.add_stage(InMemorySinkStage(config)) + comp_stage = pipe.add_stage(CompareDataFrameStage(config, filter_probs_df.copy(deep=True))) + pipe.run() - on_data_fn = getattr(stage, on_data_fn_name) - assert on_data_fn(msg) is msg + assert_results(comp_stage.get_results()) + in_messages = sink_1.get_messages() + assert len(in_messages) == 1 + out_messages = sink_2.get_messages() + assert len(out_messages) == 1 + assert in_messages[0] is out_messages[0] + +@pytest.mark.gpu_and_cpu_mode @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'developer_guide/1_simple_python_stage/pass_thru.py')) def test_pass_thru_ex1(config: Config, filter_probs_df: DataFrameType, import_mod: types.ModuleType): pass_thru = import_mod _check_pass_thru(config, filter_probs_df, pass_thru.PassThruStage) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'developer_guide/1_simple_python_stage/pass_thru_deco.py')) def test_pass_thru_ex1_deco(config: Config, filter_probs_df: DataFrameType, import_mod: types.ModuleType): pass_thru = import_mod - _check_pass_thru(config, filter_probs_df, pass_thru.pass_thru_stage, on_data_fn_name='_on_data_fn') + _check_pass_thru(config, filter_probs_df, pass_thru.pass_thru_stage) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.import_mod( os.path.join(TEST_DIRS.examples_dir, 'developer_guide/3_simple_cpp_stage/src/simple_cpp_stage/pass_thru.py')) def test_pass_thru_ex3(config: Config, filter_probs_df: DataFrameType, import_mod: types.ModuleType): diff --git a/tests/examples/gnn_fraud_detection_pipeline/conftest.py b/tests/examples/gnn_fraud_detection_pipeline/conftest.py index 1ab1cc7544..e8f80e7054 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/conftest.py +++ 
b/tests/examples/gnn_fraud_detection_pipeline/conftest.py @@ -44,7 +44,7 @@ def cuml_fixture(fail_missing: bool): @pytest.fixture(name="config") -def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config): """ The GNN fraud detection pipeline utilizes the "other" pipeline mode. """ diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py index c597c430ca..de0de0826e 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_classification_stage.py @@ -25,7 +25,7 @@ # pylint: disable=no-name-in-module -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestClassificationStage: def test_constructor(self, config: Config, xgb_model: str, cuml: types.ModuleType): diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py index ee278ef549..d7a8f47e8e 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_graph_construction_stage.py @@ -28,7 +28,7 @@ # pylint: disable=no-name-in-module -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestGraphConstructionStage: def test_constructor(self, config: Config, training_file: str): diff --git a/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py b/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py index f272098a7d..a4a5241a25 100644 --- a/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py +++ b/tests/examples/gnn_fraud_detection_pipeline/test_graph_sage_stage.py @@ -25,7 +25,7 @@ # pylint: disable=no-name-in-module @pytest.mark.usefixtures("manual_seed") -@pytest.mark.use_python +@pytest.mark.gpu_mode class TestGraphSageStage: def 
test_constructor(self, config: Config, model_dir: str): diff --git a/tests/examples/llm/common/test_content_extractor_module.py b/tests/examples/llm/common/test_content_extractor_module.py index 2c77737681..805f9b2b1f 100644 --- a/tests/examples/llm/common/test_content_extractor_module.py +++ b/tests/examples/llm/common/test_content_extractor_module.py @@ -88,8 +88,6 @@ def generate_random_string(length: int) -> str: return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) -@pytest.mark.use_python -@pytest.mark.use_cudf @pytest.mark.parametrize("data_len, num_rows_per_file, batch_size", [(40, 5, 2), (51, 3, 1), (150, 10, 5), (500, 3, 2), (1000, 5, 3), (50, 10, 2), (100, 20, 3), (50, 5, 1), (100, 10, 1), (49, 5, 2), (99, 5, 2), (60, 7, 2), (120, 6, 3), (1000, 50, 10), diff --git a/tests/examples/llm/common/test_web_scraper_module.py b/tests/examples/llm/common/test_web_scraper_module.py index 592f5d38fb..012cb45fa3 100644 --- a/tests/examples/llm/common/test_web_scraper_module.py +++ b/tests/examples/llm/common/test_web_scraper_module.py @@ -30,8 +30,6 @@ @pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.use_cudf @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/vdb_upload/module/web_scraper_module.py')) def test_web_scraper_module(config: Config, mock_rest_server: str, import_mod: types.ModuleType): url = f"{mock_rest_server}/www/index" diff --git a/tests/examples/llm/common/test_web_scraper_stage.py b/tests/examples/llm/common/test_web_scraper_stage.py index 418d245043..6526c00df1 100644 --- a/tests/examples/llm/common/test_web_scraper_stage.py +++ b/tests/examples/llm/common/test_web_scraper_stage.py @@ -28,8 +28,6 @@ @pytest.mark.slow -@pytest.mark.use_python -@pytest.mark.use_cudf @pytest.mark.import_mod(os.path.join(TEST_DIRS.examples_dir, 'llm/vdb_upload/module/web_scraper_stage.py')) def test_http_client_source_stage_pipe(config: Config, mock_rest_server: str, import_mod: types.ModuleType): url = 
f"{mock_rest_server}/www/index" diff --git a/tests/examples/llm/vdb_upload/test_schema_transform_module.py b/tests/examples/llm/vdb_upload/test_schema_transform_module.py index 8a4ed6e870..75dc7178a5 100644 --- a/tests/examples/llm/vdb_upload/test_schema_transform_module.py +++ b/tests/examples/llm/vdb_upload/test_schema_transform_module.py @@ -27,8 +27,6 @@ from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage -@pytest.mark.use_python -@pytest.mark.use_cudf @pytest.mark.parametrize("num_select, num_renames", [(1, 0), (0, 1), (1, 1), (6, 6), (13, 10), (10, 13)]) def test_schema_transform_module(num_select, num_renames, diff --git a/tests/examples/log_parsing/conftest.py b/tests/examples/log_parsing/conftest.py index f927c3fcc1..d31891873a 100644 --- a/tests/examples/log_parsing/conftest.py +++ b/tests/examples/log_parsing/conftest.py @@ -17,7 +17,7 @@ @pytest.fixture(name="config") -def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config): """ The log_parsing pipelie requires NLP mode. Set this here so all the tests don't need to set it themselves. 
""" diff --git a/tests/examples/log_parsing/test_inference.py b/tests/examples/log_parsing/test_inference.py index f4a7aac660..a721d8afc7 100644 --- a/tests/examples/log_parsing/test_inference.py +++ b/tests/examples/log_parsing/test_inference.py @@ -22,10 +22,10 @@ import numpy as np import pytest -import morpheus._lib.messages as _messages from _utils import TEST_DIRS from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemoryNLP from morpheus.messages import MessageMeta from morpheus.messages import TensorMemory from morpheus.stages.inference.triton_inference_stage import TritonInferenceWorker @@ -52,13 +52,13 @@ def build_resp_message(df: DataFrameType, num_cols: int = 2) -> ControlMessage: seq_ids[:, 2] = 42 meta = MessageMeta(df) - mem = _messages.TensorMemory(count=count, - tensors={ - 'confidences': cp.zeros((count, num_cols)), - 'labels': cp.zeros((count, num_cols)), - 'input_ids': cp.zeros((count, num_cols), dtype=cp.float32), - 'seq_ids': seq_ids - }) + mem = TensorMemory(count=count, + tensors={ + 'confidences': cp.zeros((count, num_cols)), + 'labels': cp.zeros((count, num_cols)), + 'input_ids': cp.zeros((count, num_cols), dtype=cp.float32), + 'seq_ids': seq_ids + }) cm = ControlMessage() cm.payload(meta) cm.tensors(mem) @@ -78,10 +78,10 @@ def build_inf_message(df: DataFrameType, mess_count: int, count: int, num_cols: seq_ids[:, 2] = 42 meta = MessageMeta(df) - mem = _messages.InferenceMemoryNLP(count=tensor_length, - input_ids=cp.zeros((tensor_length, num_cols), dtype=cp.float32), - input_mask=cp.zeros((tensor_length, num_cols), dtype=cp.float32), - seq_ids=seq_ids) + mem = InferenceMemoryNLP(count=tensor_length, + input_ids=cp.zeros((tensor_length, num_cols), dtype=cp.float32), + input_mask=cp.zeros((tensor_length, num_cols), dtype=cp.float32), + seq_ids=seq_ids) cm = ControlMessage() cm.payload(meta) cm.tensors(mem) diff --git a/tests/examples/log_parsing/test_postprocessing.py 
b/tests/examples/log_parsing/test_postprocessing.py index 48baeaddc1..e6271d8a42 100644 --- a/tests/examples/log_parsing/test_postprocessing.py +++ b/tests/examples/log_parsing/test_postprocessing.py @@ -23,12 +23,12 @@ import numpy as np import pytest -import morpheus._lib.messages as _messages from _utils import TEST_DIRS from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory @pytest.fixture(scope='module', name="model_config_file") @@ -55,7 +55,7 @@ def build_post_proc_message(dataset_cudf: DatasetManager, log_test_data_dir: str seq_ids[:, 2] = cp.asarray(host__seq_data)[:, 2] tensors['seq_ids'] = seq_ids - memory = _messages.TensorMemory(count=5, tensors=tensors) + memory = TensorMemory(count=5, tensors=tensors) msg = ControlMessage() msg.payload(meta) diff --git a/tests/examples/ransomware_detection/conftest.py b/tests/examples/ransomware_detection/conftest.py index 7c3ca3e74e..9beffab06a 100644 --- a/tests/examples/ransomware_detection/conftest.py +++ b/tests/examples/ransomware_detection/conftest.py @@ -38,7 +38,7 @@ def dask_distributed_fixture(dask_distributed): @pytest.fixture(name="config") -def config_fixture(config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config): """ The ransomware detection pipeline utilizes the FIL pipeline mode. 
""" diff --git a/tests/examples/ransomware_detection/test_create_features.py b/tests/examples/ransomware_detection/test_create_features.py index 29c06efdc1..9f5ffa9218 100644 --- a/tests/examples/ransomware_detection/test_create_features.py +++ b/tests/examples/ransomware_detection/test_create_features.py @@ -19,18 +19,15 @@ import typing from unittest import mock -import pytest +import pandas as pd from _utils import TEST_DIRS from _utils.dataset_manager import DatasetManager from morpheus.config import Config -from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import AppShieldMessageMeta from morpheus.pipeline.control_message_stage import ControlMessageStage from morpheus.stages.input.appshield_source_stage import AppShieldSourceStage -@pytest.mark.use_python class TestCreateFeaturesRWStage: # pylint: disable=no-name-in-module @@ -92,18 +89,25 @@ def test_on_next(self, mock_dask_client.submit.return_value = mock_dask_future input_glob = os.path.join(TEST_DIRS.tests_data_dir, 'appshield', 'snapshot-1', '*.json') - input_data = AppShieldSourceStage.files_to_dfs(glob.glob(input_glob), - cols_include=rwd_conf['raw_columns'], - cols_exclude=["SHA256"], - plugins_include=interested_plugins, - encoding='latin1') + appshield_source_stage = AppShieldSourceStage(config, + input_glob, + plugins_include=interested_plugins, + cols_include=rwd_conf['raw_columns'], + cols_exclude=["SHA256"], + encoding='latin1') - input_metas = AppShieldSourceStage._build_metadata(input_data) + input_data = appshield_source_stage.files_to_dfs(glob.glob(input_glob), + cols_include=rwd_conf['raw_columns'], + cols_exclude=["SHA256"], + plugins_include=interested_plugins, + encoding='latin1') + + input_messages = appshield_source_stage._build_messages(input_data) # Make sure the input test date looks the way we expect it - assert len(input_metas) == 1 - input_meta = input_metas[0] - assert input_meta.source == 'appshield' + assert len(input_messages) == 1 + 
input_message = input_messages[0] + assert input_message.get_metadata('source') == 'appshield' stage = CreateFeaturesRWStage(config, interested_plugins=interested_plugins, @@ -115,71 +119,24 @@ def test_on_next(self, # make sure we have a mocked dask client assert stage._client is mock_dask_client - meta = stage.on_next(input_meta) - assert isinstance(meta, AppShieldMessageMeta) - assert meta.source == input_meta.source + messages = stage.on_next(input_message) + + dataframes = [] + for message in messages: + assert message.get_metadata('source') == input_message.get_metadata('source') + dataframes.append(message.payload().copy_dataframe().to_pandas()) + + actual_df = pd.concat(dataframes, ignore_index=True) + actual_df.sort_values(by=["pid_process", "snapshot_id"], inplace=True) + actual_df.reset_index(drop=True, inplace=True) expected_df = dataset_pandas[os.path.join(test_data_dir, 'dask_results.csv')] expected_df['source_pid_process'] = 'appshield_' + expected_df.pid_process + expected_df['ldrmodules_df_path'] = expected_df['ldrmodules_df_path'].astype(str) # convert to string expected_df.sort_values(by=["pid_process", "snapshot_id"], inplace=True) expected_df.reset_index(drop=True, inplace=True) - dataset_pandas.assert_compare_df(meta.copy_dataframe(), expected_df) - - @mock.patch('stages.create_features.Client') - def test_create_control_messages(self, - mock_dask_client, - config: Config, - rwd_conf: dict, - interested_plugins: typing.List[str], - dataset_pandas: DatasetManager): - from stages.create_features import CreateFeaturesRWStage - mock_dask_client.return_value = mock_dask_client - - pids = [75956, 118469, 1348612, 2698363, 2721362, 2788672] - df = dataset_pandas["filter_probs.csv"] - df['pid_process'] = [ - 2788672, - 75956, - 75956, - 2788672, - 2788672, - 2698363, - 2721362, - 118469, - 1348612, - 2698363, - 118469, - 2698363, - 1348612, - 118469, - 75956, - 2721362, - 75956, - 118469, - 118469, - 118469 - ] - df = 
df.sort_values(by=["pid_process"]).reset_index(drop=True) - - stage = CreateFeaturesRWStage(config, - interested_plugins=interested_plugins, - feature_columns=rwd_conf['model_features'], - file_extns=rwd_conf['file_extensions'], - n_workers=5, - threads_per_worker=6) - - meta = AppShieldMessageMeta(df, source='tests') - control_messages = stage.create_control_messages(meta) - assert len(control_messages) == len(pids) - - prev_loc = 0 - for (i, _control_message) in enumerate(control_messages): - assert isinstance(_control_message, ControlMessage) - pid = pids[i] - (_control_message.payload().get_data(['pid_process']) == pid).all() - prev_loc = prev_loc + _control_message.payload().count - assert prev_loc == len(df) + dataset_pandas.assert_compare_df(actual_df, expected_df) @mock.patch('stages.create_features.Client') def test_on_completed(self, mock_dask_client, config: Config, rwd_conf: dict, interested_plugins: typing.List[str]): diff --git a/tests/examples/ransomware_detection/test_preprocessing.py b/tests/examples/ransomware_detection/test_preprocessing.py index ad9d3b74eb..9d1f8e81ef 100644 --- a/tests/examples/ransomware_detection/test_preprocessing.py +++ b/tests/examples/ransomware_detection/test_preprocessing.py @@ -20,11 +20,10 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_base_stage import PreprocessBaseStage -@pytest.mark.use_python class TestPreprocessingRWStage: # pylint: disable=no-name-in-module @@ -147,22 +146,19 @@ def test_merge_curr_and_prev_snapshots(self, config: Config, rwd_conf: dict, dat stage._merge_curr_and_prev_snapshots(df, source_pid_process) dataset_pandas.assert_compare_df(df.fillna(''), expected_df) - def test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_pandas: DatasetManager): - 
- # Pylint currently fails to work with classmethod: https://github.com/pylint-dev/pylint/issues/981 - # pylint: disable=no-member - + def test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_cudf: DatasetManager): from stages.preprocessing import PreprocessingRWStage - df = dataset_pandas['examples/ransomware_detection/dask_results.csv'] + df = dataset_cudf['examples/ransomware_detection/dask_results.csv'] df['source_pid_process'] = 'appshield_' + df.pid_process expected_df = df.copy(deep=True).fillna('') - meta = AppShieldMessageMeta(df=df, source='tests') - control_msg = ControlMessage() - control_msg.payload(meta) + meta = MessageMeta(df) + cm = ControlMessage() + cm.payload(meta) + cm.set_metadata('source', 'tests') sliding_window = 4 stage = PreprocessingRWStage(config, feature_columns=rwd_conf['model_features'], sliding_window=sliding_window) - results: ControlMessage = stage._pre_process_batch(control_msg) + results: ControlMessage = stage._pre_process_batch(cm) assert isinstance(results, ControlMessage) expected_df['sequence'] = ['dummy' for _ in range(len(expected_df))] @@ -171,6 +167,9 @@ def test_pre_process_batch(self, config: Config, rwd_conf: dict, dataset_pandas: expected_seq_ids[:, 0] = cp.arange(0, len(expected_df), dtype=cp.uint32) expected_seq_ids[:, 2] = len(rwd_conf['model_features']) * 3 - dataset_pandas.assert_compare_df(results.payload().get_data().fillna(''), expected_df) - assert (results.tensors().get_tensor('input__0') == expected_input__0).all() - assert (results.tensors().get_tensor('seq_ids') == expected_seq_ids).all() + actual_df = results.payload().copy_dataframe().to_pandas().fillna('') + dataset_cudf.assert_compare_df(actual_df, expected_df) + + actual_tensors = results.tensors() + assert (actual_tensors.get_tensor('input__0') == expected_input__0).all() + assert (actual_tensors.get_tensor('seq_ids') == expected_seq_ids).all() diff --git a/tests/morpheus/apps/test_abp.py b/tests/morpheus/apps/test_abp.py index 
17d23f248a..f90f9e1eef 100755 --- a/tests/morpheus/apps/test_abp.py +++ b/tests/morpheus/apps/test_abp.py @@ -15,15 +15,12 @@ # limitations under the License. import os -from unittest import mock -import numpy as np import pytest from _utils import TEST_DIRS from _utils import calc_error_val from _utils import compare_class_to_scores -from _utils import mk_async_infer from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.config import PipelineModes @@ -48,71 +45,7 @@ @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = 
load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(PreprocessFILStage(config)) - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - pipe.add_stage(AddScoresStage(config, prefix="score_")) - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - compare_class_to_scores(out_file, config.class_labels, '', 'score_', threshold=0.5) - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.FIL @@ -161,90 +94,7 @@ def test_abp_cpp(config: Config, tmp_path: str, morpheus_log_level: int): @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_multi_segment_no_cpp(mock_triton_client: mock.MagicMock, - config: Config, - tmp_path: str, - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 
'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 1 - - pipe.add_stage(PreprocessFILStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 2 - - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 3 - - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", 
log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 4 - - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - - pipe.add_segment_boundary(ControlMessage) # Boundary 5 - - pipe.add_stage(SerializeStage(config)) - - pipe.add_segment_boundary(MessageMeta) # Boundary 6 - - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - results = calc_error_val(results_file_name) - assert results.diff_rows == 0 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_multi_segment_cpp(config, tmp_path): diff --git a/tests/morpheus/apps/test_abp_kafka.py b/tests/morpheus/apps/test_abp_kafka.py index 46306ff29c..7c241b8a5d 100755 --- a/tests/morpheus/apps/test_abp_kafka.py +++ b/tests/morpheus/apps/test_abp_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from _utils.kafka import write_file_to_kafka @@ -54,100 +51,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_abp_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - 'name': 'input__0', 'datatype': 'FP32', "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - 'name': 'output__0', 'datatype': 'FP32', 'shape': ['-1', '1'] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - 
mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_abp_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.FIL - config.class_labels = ["mining"] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - config.fil = ConfigFIL() - config.fil.feature_columns = load_labels_file(os.path.join(TEST_DIRS.data_dir, 'columns_fil.txt')) - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'abp-validation-data.jsonlines') - - # Fill our topic with the input data - num_records = write_file_to_kafka(kafka_bootstrap_servers, kafka_topics.input_topic, val_file_name) - - pipe = LinearPipeline(config) - pipe.set_source( - KafkaSourceStage(config, - bootstrap_servers=kafka_bootstrap_servers, - input_topic=kafka_topics.input_topic, - auto_offset_reset="earliest", - poll_interval="1seconds", - stop_after=num_records, - client_id="test_abp_no_cpp_reader")) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(PreprocessFILStage(config)) - pipe.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config)) - 
pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, - bootstrap_servers=kafka_bootstrap_servers, - output_topic=kafka_topics.output_topic, - client_id="test_abp_no_cpp_writer")) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f'{rec.value.decode("utf-8")}\n') - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == num_records - - results = compare_df(val_df, output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 0 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_abp_cpp(config: Config, dataset_pandas: DatasetManager, diff --git a/tests/morpheus/apps/test_phishing.py b/tests/morpheus/apps/test_phishing.py index 77e752ef3f..41f3d9c7c2 100755 --- a/tests/morpheus/apps/test_phishing.py +++ b/tests/morpheus/apps/test_phishing.py @@ -15,14 +15,11 @@ # limitations under the License. 
import os -from unittest import mock -import numpy as np import pytest from _utils import TEST_DIRS from _utils import calc_error_val -from _utils import mk_async_infer from morpheus.config import Config from morpheus.config import PipelineModes from morpheus.pipeline import LinearPipeline @@ -43,75 +40,7 @@ @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_email_no_cpp(mock_triton_client: mock.MagicMock, config: Config, tmp_path: str, morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 2] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_phishing_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'phishing-email-validation-data.jsonlines') - 
vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - out_file = os.path.join(tmp_path, 'results.csv') - results_file_name = os.path.join(tmp_path, 'results.json') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', - force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) - pipe.add_stage( - ValidationStage(config, val_file_name=val_file_name, results_file_name=results_file_name, rel_tol=0.05)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False)) - - pipe.run() - results = calc_error_val(results_file_name) - assert results.diff_rows == 153 - - -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_email_cpp(config: Config, tmp_path: str, morpheus_log_level: int): config.mode = PipelineModes.NLP diff --git a/tests/morpheus/apps/test_phishing_kafka.py b/tests/morpheus/apps/test_phishing_kafka.py index 1a04061cc9..3524cc62f4 100755 --- a/tests/morpheus/apps/test_phishing_kafka.py +++ b/tests/morpheus/apps/test_phishing_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from _utils.kafka import write_file_to_kafka 
@@ -53,103 +50,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_email_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT64", "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 2] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 'triton_phishing_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = load_labels_file(os.path.join(TEST_DIRS.data_dir, "labels_phishing.txt")) - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'phishing-email-validation-data.jsonlines') - vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - - num_records = write_file_to_kafka(kafka_bootstrap_servers, 
kafka_topics.input_topic, val_file_name) - - # Disabling commits due to known issue in Python impl: https://github.com/nv-morpheus/Morpheus/issues/294 - pipe = LinearPipeline(config) - pipe.set_source( - KafkaSourceStage(config, - bootstrap_servers=kafka_bootstrap_servers, - input_topic=kafka_topics.input_topic, - auto_offset_reset="earliest", - poll_interval="1seconds", - disable_commit=True, - stop_after=num_records)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='phishing-bert-onnx', server_url='test:0000', - force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, labels=["is_phishing"], threshold=0.7)) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f"{rec.value.decode('utf-8')}\n") - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == num_records - - results = compare_df(val_df, output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 153 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_email_cpp(dataset_pandas: DatasetManager, config: Config, diff --git a/tests/morpheus/apps/test_sid.py b/tests/morpheus/apps/test_sid.py index 304d6a5f04..4fb5616b82 100755 --- a/tests/morpheus/apps/test_sid.py +++ 
b/tests/morpheus/apps/test_sid.py @@ -169,7 +169,7 @@ def _run_minibert(*, @pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_minibert_no_trunc(config: Config, tmp_path: str, morpheus_log_level: int): diff --git a/tests/morpheus/apps/test_sid_kafka.py b/tests/morpheus/apps/test_sid_kafka.py index eb70a98fc9..5d85188d6f 100755 --- a/tests/morpheus/apps/test_sid_kafka.py +++ b/tests/morpheus/apps/test_sid_kafka.py @@ -17,14 +17,11 @@ import os import typing from io import StringIO -from unittest import mock -import numpy as np import pandas import pytest from _utils import TEST_DIRS -from _utils import mk_async_infer from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from morpheus.config import Config @@ -51,102 +48,7 @@ @pytest.mark.kafka @pytest.mark.slow -@pytest.mark.use_python -@mock.patch('tritonclient.grpc.InferenceServerClient') -def test_minibert_no_cpp(mock_triton_client: mock.MagicMock, - dataset_pandas: DatasetManager, - config: Config, - kafka_bootstrap_servers: str, - kafka_topics: KafkaTopics, - kafka_consumer: "KafkaConsumer", - morpheus_log_level: int): - mock_metadata = { - "inputs": [{ - "name": "input_ids", "datatype": "INT32", "shape": [-1, FEATURE_LENGTH] - }, { - "name": "attention_mask", "datatype": "INT32", "shape": [-1, FEATURE_LENGTH] - }], - "outputs": [{ - "name": "output", "datatype": "FP32", "shape": [-1, 10] - }] - } - mock_model_config = {"config": {"max_batch_size": MODEL_MAX_BATCH_SIZE}} - - mock_triton_client.return_value = mock_triton_client - mock_triton_client.is_server_live.return_value = True - mock_triton_client.is_server_ready.return_value = True - mock_triton_client.is_model_ready.return_value = True - mock_triton_client.get_model_metadata.return_value = mock_metadata - mock_triton_client.get_model_config.return_value = mock_model_config - - data = np.loadtxt(os.path.join(TEST_DIRS.tests_data_dir, 
'triton_sid_inf_results.csv'), delimiter=',') - inf_results = np.split(data, range(MODEL_MAX_BATCH_SIZE, len(data), MODEL_MAX_BATCH_SIZE)) - - async_infer = mk_async_infer(inf_results) - mock_triton_client.async_infer.side_effect = async_infer - - config.mode = PipelineModes.NLP - config.class_labels = [ - "address", - "bank_acct", - "credit_card", - "email", - "govt_id", - "name", - "password", - "phone_num", - "secret_keys", - "user" - ] - config.model_max_batch_size = MODEL_MAX_BATCH_SIZE - config.pipeline_batch_size = 1024 - config.feature_length = FEATURE_LENGTH - config.edge_buffer_size = 128 - config.num_threads = 1 - - val_file_name = os.path.join(TEST_DIRS.validation_data_dir, 'sid-validation-data.csv') - vocab_file_name = os.path.join(TEST_DIRS.data_dir, 'bert-base-uncased-hash.txt') - - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=val_file_name, iterative=False)) - pipe.add_stage(DeserializeStage(config)) - pipe.add_stage( - PreprocessNLPStage(config, - vocab_hash_file=vocab_file_name, - truncation=True, - do_lower_case=True, - add_special_tokens=False)) - pipe.add_stage( - TritonInferenceStage(config, model_name='sid-minibert-onnx', server_url='fake:001', force_convert_inputs=True)) - pipe.add_stage( - MonitorStage(config, description="Inference Rate", smoothing=0.001, unit="inf", log_level=morpheus_log_level)) - pipe.add_stage(AddClassificationsStage(config, threshold=0.5, prefix="si_")) - pipe.add_stage(SerializeStage(config)) - pipe.add_stage( - WriteToKafkaStage(config, bootstrap_servers=kafka_bootstrap_servers, output_topic=kafka_topics.output_topic)) - - pipe.run() - - val_df = dataset_pandas[val_file_name] - - output_buf = StringIO() - for rec in kafka_consumer: - output_buf.write(f"{rec.value.decode('utf-8')}\n") - - output_buf.seek(0) - output_df = pandas.read_json(output_buf, lines=True) - output_df = filter_null_data(output_df) - - assert len(output_df) == len(val_df) - - results = compare_df(val_df, 
output_df, exclude_columns=[r'^ID$', r'^_ts_'], rel_tol=0.05) - - assert results['diff_rows'] == 1333 - - -@pytest.mark.kafka -@pytest.mark.slow -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("launch_mock_triton") def test_minibert_cpp(dataset_pandas: DatasetManager, config: Config, diff --git a/tests/morpheus/controllers/test_elasticsearch_controller.py b/tests/morpheus/controllers/test_elasticsearch_controller.py index 903e4bf14f..3a136b0cd8 100644 --- a/tests/morpheus/controllers/test_elasticsearch_controller.py +++ b/tests/morpheus/controllers/test_elasticsearch_controller.py @@ -48,14 +48,12 @@ def inner_create_controller(*, connection_kwargs=connection_kwargs, refresh_peri yield inner_create_controller -@pytest.mark.use_python def test_constructor(create_controller: typing.Callable[..., ElasticsearchController], connection_kwargs: dict): assert create_controller(raise_on_exception=True)._raise_on_exception is True assert create_controller(refresh_period_secs=1.5)._refresh_period_secs == 1.5 assert create_controller()._connection_kwargs == connection_kwargs -@pytest.mark.use_python def test_refresh_client_force(create_controller: typing.Callable[..., ElasticsearchController]): controller = create_controller(refresh_period_secs=1) @@ -68,7 +66,6 @@ def test_refresh_client_force(create_controller: typing.Callable[..., Elasticsea assert controller._last_refresh_time > 0 -@pytest.mark.use_python def test_refresh_client_not_needed(create_controller: typing.Callable[..., ElasticsearchController]): controller = create_controller() client = controller._client @@ -81,7 +78,6 @@ def test_refresh_client_not_needed(create_controller: typing.Callable[..., Elast assert is_refreshed is False -@pytest.mark.use_python def test_refresh_client_needed(create_controller: typing.Callable[..., ElasticsearchController]): # Set a 1 second refresh period @@ -98,7 +94,6 @@ def test_refresh_client_needed(create_controller: typing.Callable[..., Elasticse assert 
is_refreshed is True -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.parallel_bulk", return_value=[(True, None)]) def test_parallel_bulk_write(mock_parallel_bulk, create_controller: typing.Callable[..., ElasticsearchController]): # Define your mock actions @@ -108,7 +103,6 @@ def test_parallel_bulk_write(mock_parallel_bulk, create_controller: typing.Calla mock_parallel_bulk.assert_called_once() -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.parallel_bulk", return_value=[(True, None)]) def test_df_to_parallel_bulk_write(mock_parallel_bulk: typing.Callable, create_controller: typing.Callable[..., ElasticsearchController]): diff --git a/tests/morpheus/dfencoder/test_autoencoder.py b/tests/morpheus/dfencoder/test_autoencoder.py index bd02907f92..be11cb4cf8 100755 --- a/tests/morpheus/dfencoder/test_autoencoder.py +++ b/tests/morpheus/dfencoder/test_autoencoder.py @@ -37,7 +37,7 @@ from morpheus.models.dfencoder.dataloader import FileSystemDataset # Only pandas and Python is supported -pytestmark = [pytest.mark.use_pandas, pytest.mark.use_python] +pytestmark = [pytest.mark.use_pandas, pytest.mark.cpu_mode] BIN_COLS = ['ts_anomaly'] diff --git a/tests/morpheus/dfencoder/test_pkg.py b/tests/morpheus/dfencoder/test_pkg.py deleted file mode 100755 index 3b5d39585c..0000000000 --- a/tests/morpheus/dfencoder/test_pkg.py +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - - -@pytest.mark.skip -def test_old_dfencoder_not_in_env(): - """ - Verify the old external dfencoder doesn't exist in the current env - """ - with pytest.raises(ModuleNotFoundError): - import dfencoder # noqa: F401 #pylint:disable=unused-import diff --git a/tests/morpheus/io/test_io_utils.py b/tests/morpheus/io/test_io_utils.py index 1ad46b75cb..3c3e241ce8 100755 --- a/tests/morpheus/io/test_io_utils.py +++ b/tests/morpheus/io/test_io_utils.py @@ -14,14 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import typing from collections.abc import Callable +import pandas as pd import pytest import cudf from _utils.dataset_manager import DatasetManager +from morpheus.config import ExecutionMode from morpheus.io import utils as io_utils +from morpheus.utils.type_aliases import DataFrameModule from morpheus.utils.type_aliases import DataFrameType MULTI_BYTE_STRINGS = ["ñäμɛ", "Moρφέας", "taç"] @@ -132,3 +136,15 @@ def test_truncate_string_cols_by_bytes(dataset: DatasetManager, assert isinstance(df, expected_df_class) dataset.assert_df_equal(df, expected_df) + + +@pytest.mark.parametrize("mode, expected", + [(ExecutionMode.GPU, cudf.read_json), (ExecutionMode.CPU, pd.read_json), + ("cudf", cudf.read_json), ("pandas", pd.read_json)]) +def test_get_json_reader(mode: typing.Union[ExecutionMode, DataFrameModule], expected: Callable[..., DataFrameType]): + reader = io_utils.get_json_reader(mode) + if hasattr(reader, "func"): + # Unwrap partial + reader = reader.func + + assert reader is expected diff --git a/tests/morpheus/messages/test_control_message.py b/tests/morpheus/messages/test_control_message.py index 85f2aa344f..b9ba42d079 100644 --- a/tests/morpheus/messages/test_control_message.py +++ b/tests/morpheus/messages/test_control_message.py @@ -18,24 +18,25 @@ import io import sys -import cupy as cp import pytest from _utils.dataset_manager import DatasetManager from morpheus import messages +from morpheus.config import Config from morpheus.messages import TensorMemory +from morpheus.utils.type_utils import get_array_pkg # pylint: disable=unsupported-membership-test # pylint: disable=unsubscriptable-object -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_init(): messages.ControlMessage() # noqa: F841 messages.ControlMessage({"test": "test"}) # noqa: F841 -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_tasks(): message = messages.ControlMessage() assert 
len(message.get_tasks()) == 0 @@ -70,12 +71,6 @@ def test_control_message_tasks(): assert message.get_tasks()["type_a"][0]["key_x"] == "value_x" assert message.get_tasks()["type_a"][1]["key_y"] == "value_y" - # Ensure the underlying tasks cannot are not modified - message = messages.ControlMessage() - tasks = message.get_tasks() - tasks["type_a"] = [{"key_x", "value_x"}] # pylint: disable=unsupported-assignment-operation - assert len(message.get_tasks()) == 0 - message = messages.ControlMessage() message.add_task("type_a", {"key_x": "value_x"}) message.add_task("type_a", {"key_y": "value_y"}) @@ -86,7 +81,7 @@ def test_control_message_tasks(): assert message.get_tasks()["type_a"][1]["key_y"] == "value_y" -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_metadata(): message = messages.ControlMessage() @@ -108,11 +103,8 @@ def test_control_message_metadata(): assert message.get_metadata()["key_y"] == "value_yy" - message.get_metadata()["not_mutable"] = 5 # pylint: disable=unsupported-assignment-operation - - assert "not_mutable" not in message.get_metadata() - +@pytest.mark.gpu_and_cpu_mode def test_set_and_get_metadata(): message = messages.ControlMessage() @@ -132,6 +124,7 @@ def test_set_and_get_metadata(): assert all_metadata["another_key"] == "another_value" +@pytest.mark.gpu_and_cpu_mode def test_list_metadata(): message = messages.ControlMessage() @@ -146,6 +139,7 @@ def test_list_metadata(): assert set(keys) == {"key1", "key2", "key3"} +@pytest.mark.gpu_and_cpu_mode def test_get_metadata_default_value(): message = messages.ControlMessage() @@ -159,7 +153,7 @@ def test_get_metadata_default_value(): assert message.get_metadata("non_existing_key", "default_value") == "default_value" -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_get(): raw_control_message = messages.ControlMessage({ "test": "test_rcm", "tasks": [{ @@ -183,7 +177,7 @@ def 
test_control_message_get(): assert (control_message.has_task("load")) -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_control_message_set(): raw_control_message = messages.ControlMessage() control_message = messages.ControlMessage() @@ -204,6 +198,7 @@ def test_control_message_set(): assert (control_message.has_task("load")) +@pytest.mark.gpu_and_cpu_mode def test_control_message_set_and_get_payload(dataset: DatasetManager): df = dataset["test_dataframe.jsonlines"] @@ -217,7 +212,7 @@ def test_control_message_set_and_get_payload(dataset: DatasetManager): DatasetManager.assert_df_equal(payload.df, payload2.df) -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_set_and_get_timestamp_single(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -234,7 +229,7 @@ def test_set_and_get_timestamp_single(): assert result == timestamp, "The retrieved timestamp should match the one that was set." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_filter_timestamp(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -255,7 +250,7 @@ def test_filter_timestamp(): assert result[f"{group}::key2"] == timestamp2, "The timestamp for key2 should match." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_modetest_tensor_manipulation_after_retrieval def test_get_timestamp_fail_if_nonexist(): # Create a ControlMessage instance msg = messages.ControlMessage() @@ -269,10 +264,15 @@ def test_get_timestamp_fail_if_nonexist(): assert str(exc_info.value) == "Timestamp for the specified key does not exist." 
-# Test setting and getting tensors with cupy arrays -@pytest.mark.usefixtures("config_only_cpp") -def test_tensors_setting_and_getting(): - data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1])} +@pytest.mark.gpu_and_cpu_mode +def test_tensors_setting_and_getting(config: Config): + # Test setting and getting tensors with cupy/numpy arrays + array_pkg = get_array_pkg(config.execution_mode) + data = { + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) + } message = messages.ControlMessage() tensor_memory = TensorMemory(count=data["input_ids"].shape[0]) tensor_memory.set_tensors(data) @@ -283,14 +283,17 @@ def test_tensors_setting_and_getting(): assert retrieved_tensors.count == data["input_ids"].shape[0], "Tensor count mismatch." for key, val in data.items(): - assert cp.allclose(retrieved_tensors.get_tensor(key), val), f"Mismatch in tensor data for {key}." + assert array_pkg.allclose(retrieved_tensors.get_tensor(key), val), f"Mismatch in tensor data for {key}." 
-# Test retrieving tensor names and checking specific tensor existence -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_names_and_existence(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_names_and_existence(config: Config): + # Test retrieving tensor names and checking specific tensor existence + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=tokenized_data["input_ids"].shape[0], tensors=tokenized_data) @@ -303,11 +306,14 @@ def test_tensor_names_and_existence(): assert retrieved_tensors.has_tensor(key), f"Tensor {key} should exist." -# Test manipulating tensors after retrieval -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_manipulation_after_retrieval(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_manipulation_after_retrieval(config: Config): + # Test manipulating tensors after retrieval + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=tokenized_data) @@ -315,17 +321,20 @@ def test_tensor_manipulation_after_retrieval(): message.tensors(tensor_memory) retrieved_tensors = message.tensors() - new_tensor = cp.array([4, 5, 6]) + new_tensor = array_pkg.array([4, 5, 6]) retrieved_tensors.set_tensor("new_tensor", new_tensor) - assert cp.allclose(retrieved_tensors.get_tensor("new_tensor"), new_tensor), "New tensor data mismatch." 
+ assert array_pkg.allclose(retrieved_tensors.get_tensor("new_tensor"), new_tensor), "New tensor data mismatch." -# Assuming there's functionality to update all tensors at once -@pytest.mark.usefixtures("config_only_cpp") -def test_tensor_update(): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_update(config: Config): + # Assuming there's functionality to update all tensors at once + array_pkg = get_array_pkg(config.execution_mode) tokenized_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=tokenized_data) @@ -334,7 +343,9 @@ def test_tensor_update(): # Update tensors with new data new_tensors = { - "input_ids": cp.array([4, 5, 6]), "input_mask": cp.array([1, 0, 1]), "segment_ids": cp.array([1, 1, 0]) + "input_ids": array_pkg.array([4, 5, 6]), + "input_mask": array_pkg.array([1, 0, 1]), + "segment_ids": array_pkg.array([1, 1, 0]) } tensor_memory.set_tensors(new_tensors) @@ -342,13 +353,14 @@ def test_tensor_update(): updated_tensors = message.tensors() for key, val in new_tensors.items(): - assert cp.allclose(updated_tensors.get_tensor(key), val), f"Mismatch in updated tensor data for {key}." + assert array_pkg.allclose(updated_tensors.get_tensor(key), val), f"Mismatch in updated tensor data for {key}." 
-@pytest.mark.usefixtures("config_only_cpp") -def test_update_individual_tensor(): - initial_data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1])} - update_data = {"input_ids": cp.array([4, 5, 6])} +@pytest.mark.gpu_and_cpu_mode +def test_update_individual_tensor(config: Config): + array_pkg = get_array_pkg(config.execution_mode) + initial_data = {"input_ids": array_pkg.array([1, 2, 3]), "input_mask": array_pkg.array([1, 1, 1])} + update_data = {"input_ids": array_pkg.array([4, 5, 6])} message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=initial_data) message.tensors(tensor_memory) @@ -358,14 +370,14 @@ def test_update_individual_tensor(): retrieved_tensors = message.tensors() # Check updated tensor - assert cp.allclose(retrieved_tensors.get_tensor("input_ids"), - update_data["input_ids"]), "Input IDs update mismatch." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_ids"), + update_data["input_ids"]), "Input IDs update mismatch." # Ensure other tensor remains unchanged - assert cp.allclose(retrieved_tensors.get_tensor("input_mask"), - initial_data["input_mask"]), "Input mask should remain unchanged after updating input_ids." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_mask"), + initial_data["input_mask"]), "input_mask should be unchanged after updating input_ids." -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_and_cpu_mode def test_behavior_with_empty_tensors(): message = messages.ControlMessage() tensor_memory = TensorMemory(count=0) @@ -376,26 +388,27 @@ def test_behavior_with_empty_tensors(): assert len(retrieved_tensors.tensor_names) == 0, "There should be no tensor names for empty tensor memory." 
-@pytest.mark.usefixtures("config_only_cpp") -def test_consistency_after_multiple_operations(): - initial_data = {"input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1])} +@pytest.mark.gpu_and_cpu_mode +def test_consistency_after_multiple_operations(config: Config): + array_pkg = get_array_pkg(config.execution_mode) + initial_data = {"input_ids": array_pkg.array([1, 2, 3]), "input_mask": array_pkg.array([1, 1, 1])} message = messages.ControlMessage() tensor_memory = TensorMemory(count=3, tensors=initial_data) message.tensors(tensor_memory) # Update a tensor - tensor_memory.set_tensor("input_ids", cp.array([4, 5, 6])) + tensor_memory.set_tensor("input_ids", array_pkg.array([4, 5, 6])) # Remove another tensor # Add a new tensor - new_tensor = {"new_tensor": cp.array([7, 8, 9])} + new_tensor = {"new_tensor": array_pkg.array([7, 8, 9])} tensor_memory.set_tensor("new_tensor", new_tensor["new_tensor"]) retrieved_tensors = message.tensors() assert retrieved_tensors.count == 3, "Tensor count mismatch after multiple operations." - assert cp.allclose(retrieved_tensors.get_tensor("input_ids"), - cp.array([4, 5, 6])), "Mismatch in input_ids after update." - assert cp.allclose(retrieved_tensors.get_tensor("new_tensor"), - new_tensor["new_tensor"]), "New tensor data mismatch." + assert array_pkg.allclose(retrieved_tensors.get_tensor("input_ids"), + array_pkg.array([4, 5, 6])), "Mismatch in input_ids after update." + assert array_pkg.allclose(retrieved_tensors.get_tensor("new_tensor"), + new_tensor["new_tensor"]), "New tensor data mismatch." 
class NonSerializablePyClass(): @@ -428,7 +441,7 @@ def fixture_pyobject(request): return request.param() -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_mode def test_metadata_holds_non_serializable_python_obj(py_object): message = messages.ControlMessage() @@ -452,7 +465,7 @@ def test_metadata_holds_non_serializable_python_obj(py_object): assert obj is metadata_dict_with_obj["nested_obj"] -@pytest.mark.usefixtures("config_only_cpp") +@pytest.mark.gpu_mode def test_tasks_hold_non_serializable_python_obj(py_object): message = messages.ControlMessage() diff --git a/tests/morpheus/messages/test_message_meta.py b/tests/morpheus/messages/test_message_meta.py index b5e2606976..db28ea80d7 100644 --- a/tests/morpheus/messages/test_message_meta.py +++ b/tests/morpheus/messages/test_message_meta.py @@ -37,10 +37,8 @@ def fixture_index_type(request: pytest.FixtureRequest) -> typing.Literal["normal @pytest.fixture(name="df", scope="function") def fixture_df( - use_cpp: bool, # pylint: disable=unused-argument - dataset: DatasetManager, - index_type: typing.Literal['normal', 'skip', 'dup', 'down', - 'updown']) -> typing.Union[cudf.DataFrame, pd.DataFrame]: + dataset: DatasetManager, index_type: typing.Literal['normal', 'skip', 'dup', 'down', + 'updown']) -> typing.Union[cudf.DataFrame, pd.DataFrame]: test_df = dataset["test_dataframe.jsonlines"] if (index_type == "normal"): @@ -296,7 +294,7 @@ def test_update_dataframe(df: DataFrameType): assert meta.get_data()[col_new_int_name].isin(col_new_int).all() # pylint: disable=unsubscriptable-object -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_pandas_df_cpp(dataset_pandas: DatasetManager): """ Test for issue #821, calling the `df` property returns an empty cudf dataframe. 
@@ -324,12 +322,12 @@ def test_cast(config: Config, dataset: DatasetManager): # pylint: disable=unuse @pytest.mark.use_pandas -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_cast_python_to_cpp(dataset: DatasetManager): """ Test that we can cast a python MessageMeta to a C++ MessageMeta """ - df = dataset["test_dataframe.jsonlines"] + df = dataset["filter_probs.csv"] py_meta = MessageMeta(df) assert isinstance(py_meta, MessageMeta) @@ -343,12 +341,12 @@ def test_cast_python_to_cpp(dataset: DatasetManager): @pytest.mark.use_pandas -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_cast_cpp_to_python(dataset: DatasetManager): """ Test that we can cast a a C++ MessageMeta to a python MessageMeta """ - df = dataset["test_dataframe.jsonlines"] + df = dataset["filter_probs.csv"] cpp_meta = MessageMetaCpp(df) py_meta = MessageMeta(cpp_meta) diff --git a/tests/morpheus/messages/test_messages.py b/tests/morpheus/messages/test_messages.py index 6c376f7e54..9fb99f1fd5 100644 --- a/tests/morpheus/messages/test_messages.py +++ b/tests/morpheus/messages/test_messages.py @@ -13,9 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import importlib -import os - import cupy as cp import pytest @@ -23,6 +20,7 @@ import morpheus._lib.messages as _messages import morpheus.config +import morpheus.utils.type_utils from morpheus import messages from morpheus.messages.memory import tensor_memory @@ -96,18 +94,6 @@ def check_all_messages(should_be_cpp: bool, no_cpp_class: bool): check_message(messages.ResponseMemoryAE, None, should_be_cpp, no_cpp_class, **{"count": 1, "probs": cp_array}) +@pytest.mark.gpu_mode def test_constructor_cpp(): check_all_messages(morpheus.config.CppConfig.get_should_use_cpp(), False) - - -@pytest.mark.reload_modules(morpheus.config) -@pytest.mark.usefixtures("reload_modules", "restore_environ") -def test_constructor_env(): - # Set the NO_CPP flag which should disable C++ regardless - os.environ['MORPHEUS_NO_CPP'] = '1' - - # Reload the CppConfig class just in case - importlib.reload(morpheus.config) - - # Check all messages. Should be False regardless due to the environment variable - check_all_messages(False, False) diff --git a/tests/morpheus/messages/test_tensor_memory.py b/tests/morpheus/messages/test_tensor_memory.py index e3f072277c..7e8d3be655 100644 --- a/tests/morpheus/messages/test_tensor_memory.py +++ b/tests/morpheus/messages/test_tensor_memory.py @@ -16,14 +16,13 @@ import os import string +import types import typing -import cupy as cp import numpy as np import pytest from _utils import TEST_DIRS -from morpheus.config import Config from morpheus.messages.memory.inference_memory import InferenceMemory from morpheus.messages.memory.inference_memory import InferenceMemoryAE from morpheus.messages.memory.inference_memory import InferenceMemoryFIL @@ -33,6 +32,7 @@ from morpheus.messages.memory.response_memory import ResponseMemoryProbs from morpheus.messages.memory.tensor_memory import TensorMemory from morpheus.utils.type_aliases import DataFrameType +from morpheus.utils.type_aliases import NDArrayType INPUT_FILE = os.path.join(TEST_DIRS.tests_data_dir, 
'filter_probs.csv') @@ -40,14 +40,14 @@ # pylint: disable=unused-argument -def compare_tensors(tensors1: typing.Dict[str, cp.ndarray], tensors2: typing.Dict[str, cp.ndarray]): +def compare_tensors(tensors1: typing.Dict[str, NDArrayType], tensors2: typing.Dict[str, NDArrayType]): assert sorted(tensors1.keys()) == sorted(tensors2.keys()) for (k, val1) in tensors1.items(): assert (val1 == tensors2[k]).all() -def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, cp.ndarray]): - other_tensors = {'ones': cp.ones(count), 'zeros': cp.zeros(count)} +def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, NDArrayType], array_pkg: types.ModuleType): + other_tensors = {'ones': array_pkg.ones(count), 'zeros': array_pkg.zeros(count)} mem = cls(count=count) assert mem.count == count @@ -73,27 +73,43 @@ def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, cp.ndar cls(count, tensors) -def test_tensor_memory(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +def check_response_memory_probs(cls: type, array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) + count = test_data.shape[0] + + mem = cls(count=count, probs=test_data) + assert mem.count == count + compare_tensors(mem.get_tensors(), {'probs': test_data}) + assert (mem.get_output('probs') == test_data).all() + + with pytest.raises(TypeError): + cls(count, test_data) + + return mem + + +@pytest.mark.gpu_and_cpu_mode +def test_tensor_memory(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] # TensorMemory expects a dictionary of { : } # Convert each column into a 1d cupy array tensors = {} for col in range(test_data.shape[1]): - tensors[string.ascii_lowercase[col]] = cp.array(test_data[:, col]) + tensors[string.ascii_lowercase[col]] = array_pkg.array(test_data[:, col]) for cls in 
(TensorMemory, InferenceMemory, ResponseMemory): - check_tensor_memory(cls, count, tensors) + check_tensor_memory(cls=cls, count=count, tensors=tensors, array_pkg=array_pkg) -@pytest.mark.use_python -def test_inference_memory_ae(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_ae(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - input_tensor = cp.array(test_data[:, 0]) - seq_ids = cp.array(test_data[:, 1]) + input_tensor = array_pkg.array(test_data[:, 0]) + seq_ids = array_pkg.array(test_data[:, 1]) mem = InferenceMemoryAE(count=count, inputs=input_tensor, seq_ids=seq_ids) assert mem.count == count @@ -105,12 +121,13 @@ def test_inference_memory_ae(config: Config): InferenceMemoryAE(count, input_tensor, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_fil(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_fil(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - input_0 = cp.array(test_data[:, 0]) - seq_ids = cp.array(test_data[:, 1]) + input_0 = array_pkg.array(test_data[:, 0]) + seq_ids = array_pkg.array(test_data[:, 1]) mem = InferenceMemoryFIL(count=count, input__0=input_0, seq_ids=seq_ids) assert mem.count == count @@ -122,13 +139,14 @@ def test_inference_memory_fil(config: Config): InferenceMemoryFIL(count, input_0, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_nlp(config: Config): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) +@pytest.mark.gpu_and_cpu_mode +def test_inference_memory_nlp(array_pkg: types.ModuleType): + test_data = array_pkg.array(np.loadtxt(INPUT_FILE, 
delimiter=",", skiprows=1)) count = test_data.shape[0] - input_ids = cp.array(test_data[:, 0]) - input_mask = cp.array(test_data[:, 1]) - seq_ids = cp.array(test_data[:, 2]) + input_ids = array_pkg.array(test_data[:, 0]) + input_mask = array_pkg.array(test_data[:, 1]) + seq_ids = array_pkg.array(test_data[:, 2]) mem = InferenceMemoryNLP(count=count, input_ids=input_ids, input_mask=input_mask, seq_ids=seq_ids) assert mem.count == count @@ -141,24 +159,9 @@ def test_inference_memory_nlp(config: Config): InferenceMemoryNLP(count, input_ids, input_mask, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def check_response_memory_probs_and_ae(cls: type): - test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) - count = test_data.shape[0] - - mem = cls(count=count, probs=test_data) - assert mem.count == count - compare_tensors(mem.get_tensors(), {'probs': test_data}) - assert (mem.get_output('probs') == test_data).all() - - with pytest.raises(TypeError): - cls(count, test_data) - - return mem - - -@pytest.mark.use_python -def test_response_memory_ae(config: Config, filter_probs_df: DataFrameType): - mem = check_response_memory_probs_and_ae(ResponseMemoryAE) +@pytest.mark.gpu_and_cpu_mode +def test_response_memory_ae(array_pkg: types.ModuleType, filter_probs_df: DataFrameType): + mem = check_response_memory_probs(ResponseMemoryAE, array_pkg) assert mem.user_id == "" assert mem.explain_df is None @@ -170,38 +173,43 @@ def test_response_memory_ae(config: Config, filter_probs_df: DataFrameType): assert (mem.explain_df.values == filter_probs_df.values).all() -def test_response_memory_probs(config: Config): - check_response_memory_probs_and_ae(ResponseMemoryProbs) +@pytest.mark.gpu_and_cpu_mode +def test_response_memory_probs(array_pkg: types.ModuleType): + check_response_memory_probs(ResponseMemoryProbs, array_pkg) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def 
test_constructor_length_error(config: Config, tensor_cls: type): +def test_constructor_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 - tensors = {"a": cp.zeros(count), "b": cp.ones(count)} + tensors = {"a": array_pkg.zeros(count), "b": array_pkg.ones(count)} with pytest.raises(ValueError): tensor_cls(count=count - 1, tensors=tensors) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensor_length_error(config: Config, tensor_cls: type): +def test_set_tensor_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 mem = tensor_cls(count=count) with pytest.raises(ValueError): - mem.set_tensor('a', cp.zeros(count + 1)) + mem.set_tensor('a', array_pkg.zeros(count + 1)) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensors_length_error(config: Config, tensor_cls: type): +def test_set_tensors_length_error(array_pkg: types.ModuleType, tensor_cls: type): count = 10 - tensors = {"a": cp.zeros(count), "b": cp.ones(count)} + tensors = {"a": array_pkg.zeros(count), "b": array_pkg.ones(count)} mem = tensor_cls(count=count + 1) with pytest.raises(ValueError): mem.set_tensors(tensors) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) @pytest.mark.parametrize( "shape", @@ -209,12 +217,12 @@ def test_set_tensors_length_error(config: Config, tensor_cls: type): (536870912, 1), # bytesize > 2**31 (134217728, 4) # bytesize > 2**31 and element count > 2**31 ]) -def test_tensorindex_bug(config: Config, tensor_cls: type, shape: typing.Tuple[int, int]): +def test_tensorindex_bug(array_pkg: types.ModuleType, tensor_cls: type, shape: typing.Tuple[int, int]): """ Test for issue #1004. We use a 32bit signed integer for shape and strides, but we shouldn't for element counts and byte sizes. 
""" - tensors = {"a": cp.zeros(shape, dtype=np.float32)} + tensors = {"a": array_pkg.zeros(shape, dtype=np.float32)} mem = tensor_cls(count=shape[0], tensors=tensors) tensor_a = mem.get_tensor('a') @@ -222,19 +230,24 @@ def test_tensorindex_bug(config: Config, tensor_cls: type, shape: typing.Tuple[i assert tensor_a.nbytes == shape[0] * shape[1] * 4 -def test_tensor_update(config: Config): +@pytest.mark.gpu_and_cpu_mode +def test_tensor_update(array_pkg: types.ModuleType): tensor_data = { - "input_ids": cp.array([1, 2, 3]), "input_mask": cp.array([1, 1, 1]), "segment_ids": cp.array([0, 0, 1]) + "input_ids": array_pkg.array([1, 2, 3]), + "input_mask": array_pkg.array([1, 1, 1]), + "segment_ids": array_pkg.array([0, 0, 1]) } tensor_memory = TensorMemory(count=3, tensors=tensor_data) # Update tensors with new data new_tensors = { - "input_ids": cp.array([4, 5, 6]), "input_mask": cp.array([1, 0, 1]), "segment_ids": cp.array([1, 1, 0]) + "input_ids": array_pkg.array([4, 5, 6]), + "input_mask": array_pkg.array([1, 0, 1]), + "segment_ids": array_pkg.array([1, 1, 0]) } tensor_memory.set_tensors(new_tensors) for (key, cp_arr) in new_tensors.items(): tensor = tensor_memory.get_tensor(key) - cp.allclose(tensor, cp_arr) + array_pkg.allclose(tensor, cp_arr) diff --git a/tests/morpheus/modules/test_from_control_message.py b/tests/morpheus/modules/test_from_control_message.py index b129bbbcc8..514dc68234 100644 --- a/tests/morpheus/modules/test_from_control_message.py +++ b/tests/morpheus/modules/test_from_control_message.py @@ -71,7 +71,7 @@ def test_get_module(): fn_constructor("FromControlMessageTest", config) # pylint: disable=not-callable -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("filename, expected_count", [("train_infer.json", 0), ("train.json", 0)], indirect=["filename"]) def test_cm_with_no_payload(config, filename, expected_count): @@ -97,7 +97,7 @@ def test_cm_with_no_payload(config, filename, expected_count): assert 
len(sink_stage.get_messages()) == expected_count -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("filename, expected_count", [("train_infer.json", 2), ("train.json", 1)], indirect=["filename"]) def test_cm_with_with_payload(config, filename, expected_count): diff --git a/tests/morpheus/modules/test_payload_batcher.py b/tests/morpheus/modules/test_payload_batcher.py index 02acd6b8ee..8fa39b18a5 100644 --- a/tests/morpheus/modules/test_payload_batcher.py +++ b/tests/morpheus/modules/test_payload_batcher.py @@ -83,7 +83,7 @@ def test_get_module(): assert isinstance(module_instance, mrc.core.segment.SegmentModule) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize( "max_batch_size, raise_on_failure, group_by_columns, disable_max_batch_size, timestamp_column_name, " "timestamp_pattern, period, expected_count, expected_exception", @@ -193,7 +193,7 @@ def test_custom_params(config, assert len(sink_stage.get_messages()) == expected_count -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_default_params(config, filter_probs_df): pipe = Pipeline(config) diff --git a/tests/morpheus/modules/test_to_control_message.py b/tests/morpheus/modules/test_to_control_message.py index 96f91a2fee..ce2218b8aa 100644 --- a/tests/morpheus/modules/test_to_control_message.py +++ b/tests/morpheus/modules/test_to_control_message.py @@ -61,7 +61,7 @@ def test_get_module(): assert isinstance(module_instance, mrc.core.segment.SegmentModule) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("expected_count", [1, 2]) def test_to_control_message_module(config, filter_probs_df, expected_count): dataframes = [filter_probs_df for _ in range(expected_count)] diff --git a/tests/morpheus/parsers/test_windows_event_parser.py b/tests/morpheus/parsers/test_windows_event_parser.py index c90207612e..f287abfcd2 100644 --- a/tests/morpheus/parsers/test_windows_event_parser.py +++ b/tests/morpheus/parsers/test_windows_event_parser.py @@ -630,6 +630,7 
@@ def test_windows_event_parser(): test_logs = fh.readlines() test_input = cudf.Series(test_logs) test_output_df = wep.parse(test_input) + for parsed_rec in test_output_df.to_records(): eventcode = parsed_rec["eventcode"] validate_func = VALIDATE_DICT.get(eventcode, unknown_record_type) diff --git a/tests/morpheus/pipeline/test_error_pipe.py b/tests/morpheus/pipeline/test_error_pipe.py index 7f1e044286..cb264f2231 100755 --- a/tests/morpheus/pipeline/test_error_pipe.py +++ b/tests/morpheus/pipeline/test_error_pipe.py @@ -16,7 +16,6 @@ import logging -import pandas as pd import pytest from _utils.stages.error_raiser import ErrorRaiserStage @@ -26,10 +25,12 @@ from morpheus.stages.general.monitor_stage import MonitorStage from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage from morpheus.stages.output.in_memory_sink_stage import InMemorySinkStage +from morpheus.utils.type_aliases import DataFrameType +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("exception_cls", [RuntimeError, ValueError, NotImplementedError]) -def test_stage_raises_exception(config: Config, filter_probs_df: pd.DataFrame, exception_cls: type[Exception]): +def test_stage_raises_exception(config: Config, filter_probs_df: DataFrameType, exception_cls: type[Exception]): pipe = LinearPipeline(config) pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) error_raiser_stage = pipe.add_stage(ErrorRaiserStage(config, exception_cls=exception_cls)) @@ -43,7 +44,7 @@ def test_stage_raises_exception(config: Config, filter_probs_df: pd.DataFrame, e assert len(sink_stage.get_messages()) == 0 -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("delayed_start", [False, True]) def test_monitor_not_impl(config: Config, delayed_start: bool): diff --git a/tests/morpheus/pipeline/test_execution_modes.py b/tests/morpheus/pipeline/test_execution_modes.py new file mode 100755 index 0000000000..d740235a1b --- /dev/null +++ 
b/tests/morpheus/pipeline/test_execution_modes.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections.abc +import typing + +import mrc +import pytest +from mrc.core import operators as ops + +from _utils.stages.conv_msg import ConvMsg +from morpheus.config import Config +from morpheus.config import ExecutionMode +from morpheus.pipeline.execution_mode_mixins import CpuOnlyMixin +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin +from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin +from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.pipeline.stage_decorator import source +from morpheus.pipeline.stage_decorator import stage + + +@source +def gpu_only_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@source(execution_modes=(ExecutionMode.CPU, )) +def cpu_only_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@source(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) +def gpu_cpu_source() -> collections.abc.Iterator[int]: + for i in range(10): + yield i + + +@stage +def gpu_only_stage(message: typing.Any) -> typing.Any: + return message + + +@stage(execution_modes=(ExecutionMode.CPU, )) +def cpu_only_stage(message: typing.Any) -> typing.Any: + return 
message + + +@stage(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) +def gpu_cpu_stage(message: typing.Any) -> typing.Any: + return message + + +class BaseStage(PassThruTypeMixin, SinglePortStage): + + def accepted_types(self) -> typing.Tuple: + return (typing.Any, ) + + def supports_cpp_node(self) -> bool: + return False + + def on_data(self, data: typing.Any) -> typing.Any: + return data + + def _build_single(self, builder: mrc.Builder, input_node: mrc.SegmentObject) -> mrc.SegmentObject: + node = builder.make_node(self.unique_name, ops.map(self.on_data)) + builder.make_edge(input_node, node) + + return node + + +class CpuOnlyStage(CpuOnlyMixin, BaseStage): + + @property + def name(self) -> str: + return "test-cpu-only-stage" + + +class GpuOnlyStage(BaseStage): + + @property + def name(self) -> str: + return "test-gpu-only-stage" + + +class GpuAndCpuStage(GpuAndCpuMixin, BaseStage): + + @property + def name(self) -> str: + return "test-gpu-and-cpu-stage" + + +@pytest.mark.parametrize("stage_cls, expected_modes", + [ + (GpuOnlyStage, {ExecutionMode.GPU}), + (CpuOnlyStage, {ExecutionMode.CPU}), + (GpuAndCpuStage, {ExecutionMode.GPU, ExecutionMode.CPU}), + (gpu_only_source, {ExecutionMode.GPU}), + (cpu_only_source, {ExecutionMode.CPU}), + (gpu_cpu_source, {ExecutionMode.GPU, ExecutionMode.CPU}), + (gpu_only_stage, {ExecutionMode.GPU}), + (cpu_only_stage, {ExecutionMode.CPU}), + (gpu_cpu_stage, {ExecutionMode.GPU, ExecutionMode.CPU}), + ]) +def test_execution_mode_mixins(stage_cls: type[ConvMsg], expected_modes: set): + # intentionally not using the config fixture so that we can set the execution mode manually + config = Config() + if ExecutionMode.CPU in expected_modes: + config.execution_mode = ExecutionMode.CPU + else: + config.execution_mode = ExecutionMode.GPU + + stage_ = stage_cls(config) + assert set(stage_.supported_execution_modes()) == expected_modes + + +@pytest.mark.parametrize("stage_cls, execution_mode", + [ + (GpuOnlyStage, ExecutionMode.CPU), 
+ (gpu_only_source, ExecutionMode.CPU), + (gpu_only_stage, ExecutionMode.CPU), + (CpuOnlyStage, ExecutionMode.GPU), + (cpu_only_source, ExecutionMode.GPU), + (cpu_only_stage, ExecutionMode.GPU), + ]) +def test_unsupported_mode_error(stage_cls: type[ConvMsg], execution_mode: ExecutionMode): + # intentionally not using the config fixture so that we can set the execution mode and avoid iterating over + # python/C++ execution modes + config = Config() + config.execution_mode = execution_mode + + with pytest.raises(RuntimeError, match="Unsupported execution mode"): + stage_ = stage_cls(config) + stage_._pre_build(do_propagate=False) diff --git a/tests/morpheus/pipeline/test_file_in_out.py b/tests/morpheus/pipeline/test_file_in_out.py index a99e649821..b61e496bec 100755 --- a/tests/morpheus/pipeline/test_file_in_out.py +++ b/tests/morpheus/pipeline/test_file_in_out.py @@ -41,6 +41,7 @@ @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_type", ["csv", "jsonlines", "parquet"]) @pytest.mark.parametrize("use_pathlib", [False, True]) @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) @@ -91,6 +92,7 @@ def test_file_rw_pipe(tmp_path: pathlib.Path, assert output_data.tolist() == validation_data.tolist() +@pytest.mark.gpu_and_cpu_mode def test_file_read_json(config: Config): src_file = os.path.join(TEST_DIRS.tests_data_dir, "simple.json") @@ -110,7 +112,7 @@ def test_file_read_json(config: Config): @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.usefixtures("chdir_tmpdir") def test_to_file_no_path(tmp_path: pathlib.Path, config: Config): """ @@ -131,6 +133,7 @@ def test_to_file_no_path(tmp_path: pathlib.Path, config: Config): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_type", ["csv", "jsonlines", "parquet"]) @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) def test_file_rw_multi_segment_pipe(tmp_path: pathlib.Path, config: Config, 
input_type: str, output_type: str): @@ -165,6 +168,7 @@ def test_file_rw_multi_segment_pipe(tmp_path: pathlib.Path, config: Config, inpu @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file", [ os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), @@ -189,6 +193,7 @@ def test_file_rw_index_pipe(tmp_path: pathlib.Path, config: Config, input_file: assert output_data.tolist() == validation_data.tolist() +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file,extra_kwargs", [(os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), { "include_header": True, "include_index_col": False @@ -196,7 +201,6 @@ def test_file_rw_index_pipe(tmp_path: pathlib.Path, config: Config, input_file: "include_header": True }), (os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.jsonlines"), {})], ids=["CSV", "CSV_ID", "JSON"]) -@pytest.mark.usefixtures("use_cpp") def test_file_roundtrip(tmp_path: pathlib.Path, input_file: str, extra_kwargs: dict[str, typing.Any]): # Output file should be same type as input @@ -235,6 +239,7 @@ def test_read_cpp_compare(input_file: str): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) def test_file_rw_serialize_deserialize_pipe(tmp_path: pathlib.Path, config: Config, output_type: str): input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") diff --git a/tests/morpheus/pipeline/test_pipe_viz.py b/tests/morpheus/pipeline/test_pipe_viz.py index da2b245886..156496ab56 100755 --- a/tests/morpheus/pipeline/test_pipe_viz.py +++ b/tests/morpheus/pipeline/test_pipe_viz.py @@ -25,6 +25,7 @@ from _utils.dataset_manager import DatasetManager from _utils.stages.conv_msg import ConvMsg from morpheus.cli.commands import RANKDIR_CHOICES +from morpheus.config import Config from morpheus.pipeline import LinearPipeline from morpheus.pipeline.pipeline import Pipeline from morpheus.pipeline.pipeline import PipelineState @@ -35,10 +36,8 
@@ from morpheus.stages.preprocess.deserialize_stage import DeserializeStage -# pylint: disable=redefined-outer-name -@pytest.mark.use_cudf @pytest.fixture(name="viz_pipeline", scope="function") -def viz_pipeline_fixture(config, filter_probs_df): +def viz_pipeline_fixture(config: Config, dataset_cudf: DatasetManager): """ Creates a quick pipeline. """ @@ -46,9 +45,9 @@ def viz_pipeline_fixture(config, filter_probs_df): config.num_threads = 1 pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) + pipe.set_source(InMemorySourceStage(config, [dataset_cudf["filter_probs.csv"]])) pipe.add_stage(DeserializeStage(config)) - pipe.add_stage(ConvMsg(config, filter_probs_df)) + pipe.add_stage(ConvMsg(config, dataset_cudf["filter_probs.csv"])) pipe.add_stage(AddClassificationsStage(config)) pipe.add_stage(SerializeStage(config, include=[f"^{c}$" for c in config.class_labels])) pipe.add_stage(InMemorySinkStage(config)) diff --git a/tests/morpheus/pipeline/test_pipeline.py b/tests/morpheus/pipeline/test_pipeline.py index aded507af6..56aa234ffd 100755 --- a/tests/morpheus/pipeline/test_pipeline.py +++ b/tests/morpheus/pipeline/test_pipeline.py @@ -38,7 +38,7 @@ from morpheus.utils.type_aliases import DataFrameType -class SourceTestStage(InMemorySourceStage): +class SourceTestStage(InMemorySourceStage): # pylint: disable=too-many-ancestors def __init__(self, config, diff --git a/tests/morpheus/pipeline/test_preallocation_pipe.py b/tests/morpheus/pipeline/test_preallocation_pipe.py index 53f85a46fc..f82eb97fe0 100755 --- a/tests/morpheus/pipeline/test_preallocation_pipe.py +++ b/tests/morpheus/pipeline/test_preallocation_pipe.py @@ -23,6 +23,7 @@ from _utils.stages.conv_msg import ConvMsg from morpheus.common import TypeId from morpheus.common import typeid_to_numpy_str +from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline @@ 
-32,10 +33,12 @@ from morpheus.stages.postprocess.add_scores_stage import AddScoresStage from morpheus.stages.postprocess.serialize_stage import SerializeStage from morpheus.stages.preprocess.deserialize_stage import DeserializeStage +from morpheus.utils.type_aliases import DataFrameType +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize('probs_type', [TypeId.FLOAT32, TypeId.FLOAT64]) -def test_preallocation(config, filter_probs_df, probs_type): +def test_preallocation(config: Config, filter_probs_df: DataFrameType, probs_type: TypeId): config.class_labels = ['frogs', 'lizards', 'toads', 'turtles'] probs_np_type = typeid_to_numpy_str(probs_type) expected_df = pd.DataFrame( @@ -61,8 +64,9 @@ def test_preallocation(config, filter_probs_df, probs_type): assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize('probs_type', [TypeId.FLOAT32, TypeId.FLOAT64]) -def test_preallocation_multi_segment_pipe(config, filter_probs_df, probs_type): +def test_preallocation_multi_segment_pipe(config: Config, filter_probs_df: DataFrameType, probs_type: TypeId): """ Test ensures that when columns are needed for preallocation in a multi-segment pipeline, the preallocagtion will always be performed on the closest source to the stage that requested preallocation. 
Which in cases where the @@ -99,7 +103,7 @@ def test_preallocation_multi_segment_pipe(config, filter_probs_df, probs_type): assert_results(comp_stage.get_results()) -@pytest.mark.use_cpp +@pytest.mark.gpu_mode def test_preallocation_error(config, filter_probs_df): """ Verify that we get a raised exception when add_scores attempts to use columns that don't exist diff --git a/tests/morpheus/pipeline/test_stage_decorator.py b/tests/morpheus/pipeline/test_stage_decorator.py index 25b3095209..31a45d553d 100644 --- a/tests/morpheus/pipeline/test_stage_decorator.py +++ b/tests/morpheus/pipeline/test_stage_decorator.py @@ -30,6 +30,7 @@ from _utils import assert_results from morpheus.common import TypeId from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline from morpheus.pipeline.stage_decorator import ComputeSchemaType @@ -41,6 +42,7 @@ from morpheus.pipeline.stage_decorator import stage from morpheus.pipeline.stage_schema import StageSchema from morpheus.stages.output.compare_dataframe_stage import CompareDataFrameStage +from morpheus.utils.type_aliases import DataFrameType def _get_annotation(type_: type, generator_type: type) -> type: @@ -59,7 +61,6 @@ def _mk_compute_schema_fn(return_type: type) -> ComputeSchemaType: return lambda schema: schema.output_schema.set_type(return_type) -@pytest.mark.use_python @pytest.mark.parametrize("generator_type", [None, typing.Iterator, typing.Generator, collections.abc.Iterator, collections.abc.Generator]) @pytest.mark.parametrize("return_type, is_prealloc", [(pd.DataFrame, True), (cudf.DataFrame, True), (MessageMeta, True), @@ -96,7 +97,6 @@ def test_source_gen() -> return_annotation: mock_compute_schema_fn.assert_called_once_with(schema) -@pytest.mark.use_python @pytest.mark.parametrize("src_cls", [WrappedFunctionSourceStage, PreAllocatedWrappedFunctionStage]) def 
test_wrapped_function_source_stage_not_generator_error(config: Config, src_cls: type): @@ -110,7 +110,6 @@ def test_source_gen() -> MessageMeta: compute_schema_fn=_mk_compute_schema_fn(MessageMeta)) -@pytest.mark.use_python @pytest.mark.parametrize("generator_type", [None, typing.Iterator, typing.Generator, collections.abc.Iterator, collections.abc.Generator]) @pytest.mark.parametrize("return_type, is_prealloc", [(pd.DataFrame, True), (cudf.DataFrame, True), (MessageMeta, True), @@ -133,7 +132,6 @@ def test_source_gen() -> return_annotation: assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_source_decorator_name(config: Config): @source @@ -144,7 +142,6 @@ def test_source_gen(value: int) -> int: assert source_stage.name == 'test_source_gen' # pylint: disable=no-member -@pytest.mark.use_python def test_source_decorator_explicit_name(config: Config): @source(name="source_gen") @@ -155,7 +152,6 @@ def test_source_gen(value: int) -> int: assert source_stage.name == 'source_gen' # pylint: disable=no-member -@pytest.mark.use_python def test_source_decorator_explicit_compute_schema(config: Config): mock_compute_schema_fn = mock.MagicMock() mock_compute_schema_fn.side_effect = _mk_compute_schema_fn(int) @@ -171,7 +167,6 @@ def test_source_gen(value: int) -> int: mock_compute_schema_fn.assert_called_once_with(schema) -@pytest.mark.use_python def test_source_decorator_no_annoation_error(config: Config): @source @@ -182,7 +177,6 @@ def test_source_gen(): test_source_gen(config) # pylint: disable=too-many-function-args -@pytest.mark.use_python def test_not_generator_error(config: Config): @source @@ -193,7 +187,6 @@ def test_fn() -> int: test_fn(config) # pylint: disable=too-many-function-args -@pytest.mark.use_python def test_source_stage_arg_no_value_error(config: Config): @source @@ -204,7 +197,6 @@ def test_source_gen(value: int) -> int: test_source_gen(config) -@pytest.mark.use_python @pytest.mark.parametrize("accept_type, 
return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), (typing.Union[float, int], float), (float, typing.Any), (typing.Any, float), @@ -220,7 +212,6 @@ def test_wrapped_function_stage_constructor(config: Config, accept_type: type, r assert wrapped_stage.accepted_types() == (accept_type, ) -@pytest.mark.use_python @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), (typing.Union[float, int], float), (float, float), (typing.Any, float), @@ -256,7 +247,6 @@ def source_fn(): assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_wrapped_function_stage_name(config: Config): def multiplier(message: MessageMeta, column: str, value: int | float) -> MessageMeta: @@ -273,7 +263,6 @@ def multiplier(message: MessageMeta, column: str, value: int | float) -> Message assert wrapped_stage.name == 'multiplier' -@pytest.mark.use_python @pytest.mark.parametrize("needed_columns", [None, { 'result': TypeId.INT64 @@ -295,7 +284,6 @@ def test_fn(message: MessageMeta) -> MessageMeta: assert wrapped_stage._needed_columns == expected_needed_columns -@pytest.mark.use_python @pytest.mark.parametrize("use_accept_type_annotation", [True, False]) @pytest.mark.parametrize("accept_type, return_type", [(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), @@ -320,7 +308,6 @@ def test_fn(message) -> return_type: assert wrapped_stage.accepted_types() == (accept_type, ) -@pytest.mark.use_python @pytest.mark.parametrize("name", [None, "unittest-stage"]) def test_stage_decorator_name(config: Config, name: str): if name is None: @@ -336,7 +323,6 @@ def test_fn(message: float, value: float) -> float: assert wrapped_stage.name == expected_name -@pytest.mark.use_python @pytest.mark.parametrize("explicit_compute_schema_fn", [True, False]) @pytest.mark.parametrize("accept_type, return_type", 
[(pd.DataFrame, MessageMeta), (int, int), (MessageMeta, MessageMeta), (typing.Any, bool), @@ -377,7 +363,6 @@ def test_stage(message: accept_type) -> return_type: assert schema.output_schema.get_type() is return_type -@pytest.mark.use_python def test_stage_decorator_no_annotation_error(config: Config): @stage @@ -388,7 +373,6 @@ def test_fn(message): test_fn(config) -@pytest.mark.use_python def test_stage_arg_no_value_error(config: Config): @stage @@ -399,7 +383,6 @@ def test_fn(message: float, value: float) -> float: test_fn(config) # pylint: disable=no-value-for-parameter -@pytest.mark.use_python @pytest.mark.parametrize("needed_columns", [None, { 'result': TypeId.INT64 @@ -417,15 +400,16 @@ def test_fn(message: MessageMeta) -> MessageMeta: assert wrapped_stage._needed_columns == expected_needed_columns -def test_end_to_end_pipe(config: Config, filter_probs_df: cudf.DataFrame): +@pytest.mark.gpu_and_cpu_mode +def test_end_to_end_pipe(config: Config, filter_probs_df: DataFrameType): - @source - def source_gen(dataframes: list[cudf.DataFrame]) -> collections.abc.Iterator[MessageMeta]: + @source(execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) + def source_gen(*, dataframes: list[DataFrameType]) -> collections.abc.Iterator[MessageMeta]: for df in dataframes: yield MessageMeta(df) - @stage - def multiplier(message: MessageMeta, column: str, value: int | float = 2.0) -> MessageMeta: + @stage(execution_modes=(ExecutionMode.GPU, ExecutionMode.CPU)) + def multiplier(message: MessageMeta, *, column: str, value: int | float = 2.0) -> MessageMeta: with message.mutable_dataframe() as df: df[column] = df[column] * value @@ -436,7 +420,7 @@ def multiplier(message: MessageMeta, column: str, value: int | float = 2.0) -> M expected_df['v2'] = expected_df['v2'] * multipy_by * 2.0 pipe = LinearPipeline(config) - pipe.set_source(source_gen(config, dataframes=[filter_probs_df])) # pylint: disable=redundant-keyword-arg + pipe.set_source(source_gen(config, 
dataframes=[filter_probs_df])) # pylint: disable=too-many-function-args pipe.add_stage(multiplier(config, column='v2', value=multipy_by)) pipe.add_stage(multiplier(config, column='v2')) sink = pipe.add_stage(CompareDataFrameStage(config, expected_df)) diff --git a/tests/morpheus/stages/test_add_classifications_stage.py b/tests/morpheus/stages/test_add_classifications_stage.py index 2966888238..98eff9e698 100755 --- a/tests/morpheus/stages/test_add_classifications_stage.py +++ b/tests/morpheus/stages/test_add_classifications_stage.py @@ -16,23 +16,21 @@ import typing -import cupy as cp +import numpy as np +import pandas as pd import pytest import typing_utils -import cudf - from _utils.dataset_manager import DatasetManager -# pylint: disable=morpheus-incorrect-lib-from-import -from morpheus._lib.messages import TensorMemory as CppTensorMemory from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage @pytest.fixture(name="config") -def config_fixture(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def config_fixture(config: Config): config.class_labels = ['frogs', 'lizards', 'toads'] yield config @@ -60,20 +58,20 @@ def test_constructor_errors(config: Config): AddClassificationsStage(config, labels=['missing']) -@pytest.mark.use_python -def test_add_labels(): +@pytest.mark.cpu_mode +def test_add_labels_with_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} threshold = 0.6 - df = cudf.DataFrame([0, 1], columns=["dummy"]) - probs_array = cp.array([[0.1, 0.6, 0.8], [0.3, 0.61, 0.9]]) + df = pd.DataFrame([0, 1], columns=["dummy"]) + probs_array = np.array([[0.1, 0.6, 0.8], [0.3, 0.61, 0.9]]) probs_array_bool = probs_array > threshold cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(CppTensorMemory(count=2,
tensors={"probs": probs_array})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array})) labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) @@ -84,7 +82,7 @@ def test_add_labels(): # Too small of a probs array cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(CppTensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=threshold) diff --git a/tests/morpheus/stages/test_add_scores_stage.py b/tests/morpheus/stages/test_add_scores_stage.py index f00338c25a..8694632abe 100755 --- a/tests/morpheus/stages/test_add_scores_stage.py +++ b/tests/morpheus/stages/test_add_scores_stage.py @@ -16,23 +16,22 @@ import typing -import cupy as cp +import numpy as np +import pandas as pd import pytest import typing_utils -import cudf - -import morpheus._lib.messages as _messages from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.add_classifications_stage import AddClassificationsStage from morpheus.stages.postprocess.add_scores_stage import AddScoresStage @pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.class_labels = ['frogs', 'lizards', 'toads'] config.feature_length = 12 yield config @@ -61,16 +60,16 @@ def test_constructor_errors(config: Config): AddScoresStage(config, labels=['missing']) -@pytest.mark.use_python -def test_add_labels(): +@pytest.mark.cpu_mode +def test_add_labels_with_control_message(): class_labels = {0: "frogs", 1: "lizards", 2: "toads"} - df = cudf.DataFrame([0, 1], columns=["dummy"]) - 
probs_array = cp.array([[0.1, 0.5, 0.8], [0.2, 0.6, 0.9]]) + df = pd.DataFrame([0, 1], columns=["dummy"]) + probs_array = np.array([[0.1, 0.5, 0.8], [0.2, 0.6, 0.9]]) cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array})) labeled_cm = AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) @@ -81,7 +80,7 @@ def test_add_labels(): # Too small of a probs array cm = ControlMessage() cm.payload(MessageMeta(df)) - cm.tensors(_messages.TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) + cm.tensors(TensorMemory(count=2, tensors={"probs": probs_array[:, 0:-1]})) with pytest.raises(RuntimeError): AddClassificationsStage._add_labels(cm, idx2label=class_labels, threshold=None) diff --git a/tests/morpheus/stages/test_appshield_source_stage.py b/tests/morpheus/stages/test_appshield_source_stage.py index f69983b2ea..03920bef8b 100755 --- a/tests/morpheus/stages/test_appshield_source_stage.py +++ b/tests/morpheus/stages/test_appshield_source_stage.py @@ -23,7 +23,8 @@ from pandas.testing import assert_frame_equal from _utils import TEST_DIRS -from morpheus.messages.message_meta import AppShieldMessageMeta +from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.stages.input.appshield_source_stage import AppShieldSourceStage from morpheus.utils.directory_watcher import DirectoryWatcher @@ -279,7 +280,7 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, @pytest.mark.parametrize( 'input_df_per_source', [{ - 'appshield': [ + 'appshield': pd.DataFrame({ 'PID': pd.Series(['304', '304', '444', '350', '360', '563'], index=[0, 1, 3, 0, 1, 3]), @@ -290,8 +291,7 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, pd.Series(['appshield', 'appshield', 'appshield', 'appshield', 'appshield', 'appshield'], index=[0, 
1, 3, 0, 1, 3]) }), - ], - 'appshield-v2': [ + 'appshield-v2': pd.DataFrame({ 'PID': pd.Series(['304', '304', '444', '350', '360', '563'], index=[0, 1, 3, 0, 1, 3]), @@ -303,11 +303,21 @@ def test_files_to_dfs(cols_include, cols_exclude, plugins_include, meta_columns, 'appshield-v2', 'appshield-v2', 'appshield-v2', 'appshield-v2', 'appshield-v2', 'appshield-v2' ], index=[0, 1, 3, 0, 1, 3]) - }), - ] + }) }]) -def test_build_metadata(input_df_per_source): - appshield_message_metas = AppShieldSourceStage._build_metadata(input_df_per_source) +def test_build_messages(config: Config, tmp_path: str, input_df_per_source: dict): + expected_sources = sorted(input_df_per_source.keys()) + + input_glob = os.path.join(tmp_path, '*.json') + # These constructor arguments are not used by the _build_messages method + stage = AppShieldSourceStage(config, input_glob, ['unused'], ['unused']) + appshield_messages = stage._build_messages(input_df_per_source) + + assert len(appshield_messages) == len(expected_sources) + + actual_sources = [] + for message in appshield_messages: + assert isinstance(message, ControlMessage) + actual_sources.append(message.get_metadata('source')) - assert len(appshield_message_metas) == 2 - assert isinstance(appshield_message_metas[0], AppShieldMessageMeta) + assert sorted(actual_sources) == expected_sources diff --git a/tests/morpheus/stages/test_deserialize_stage_pipe.py b/tests/morpheus/stages/test_deserialize_stage_pipe.py index e9d2f9e317..a8f656fa65 100755 --- a/tests/morpheus/stages/test_deserialize_stage_pipe.py +++ b/tests/morpheus/stages/test_deserialize_stage_pipe.py @@ -29,7 +29,7 @@ @pytest.mark.use_cudf -@pytest.mark.usefixtures("use_cpp") +@pytest.mark.gpu_mode def test_fixing_non_unique_indexes(dataset: DatasetManager): # Set 2 ids equal to others df = dataset.dup_index(dataset["filter_probs.csv"], count=2) diff --git a/tests/morpheus/stages/test_file_source_stage.py b/tests/morpheus/stages/test_file_source_stage.py new file mode 100755 
index 0000000000..19d2dacd51 --- /dev/null +++ b/tests/morpheus/stages/test_file_source_stage.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from _utils import TEST_DIRS +from morpheus.config import Config +from morpheus.config import ExecutionMode +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin +from morpheus.stages.input.file_source_stage import FileSourceStage + + +def test_execution_modes(config: Config): + assert issubclass(FileSourceStage, GpuAndCpuMixin) + stage = FileSourceStage(config, filename=os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv")) + + # we don't care about the order of the execution modes + assert set(stage.supported_execution_modes()) == {ExecutionMode.GPU, ExecutionMode.CPU} diff --git a/tests/morpheus/stages/test_file_source_stage_pipe.py b/tests/morpheus/stages/test_file_source_stage_pipe.py index 59f9c76d63..0f5c1fdb2e 100755 --- a/tests/morpheus/stages/test_file_source_stage_pipe.py +++ b/tests/morpheus/stages/test_file_source_stage_pipe.py @@ -25,6 +25,7 @@ from morpheus.common import FileTypes from morpheus.common import determine_file_type from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.io.deserializers import read_file_to_df from morpheus.pipeline import LinearPipeline 
from morpheus.stages.input.file_source_stage import FileSourceStage @@ -32,6 +33,7 @@ @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("input_file", [ os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv"), @@ -46,7 +48,7 @@ def test_file_source_stage_pipe(config: Config, input_file: str, filter_null: bo parser_kwargs = {} if determine_file_type(input_file) == FileTypes.JSON: # kwarg specific to pandas.read_json - parser_kwargs['convert_dates'] = False + parser_kwargs['convert_dates'] = config.execution_mode == ExecutionMode.CPU expected_df = read_file_to_df(file_name=input_file, filter_nulls=filter_null, diff --git a/tests/morpheus/stages/test_filter_detections_stage.py b/tests/morpheus/stages/test_filter_detections_stage.py index cd7f361007..d0bd75d622 100644 --- a/tests/morpheus/stages/test_filter_detections_stage.py +++ b/tests/morpheus/stages/test_filter_detections_stage.py @@ -16,13 +16,13 @@ import typing -import cupy as cp +import numpy as np import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.common import FilterSource from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.filter_detections_stage import FilterDetectionsStage @@ -31,7 +31,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm @@ -45,11 +45,11 @@ def test_constructor(config): assert typing_utils.issubtype(ControlMessage, accepted_union) -@pytest.mark.use_cudf +@pytest.mark.use_pandas def test_filter_copy(config, filter_probs_df): fds = FilterDetectionsStage(config, threshold=0.5, filter_source=FilterSource.TENSOR) - probs = cp.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) + probs = 
np.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) mock_control_message = _make_control_message(filter_probs_df, probs) # All values are at or below the threshold so nothing should be returned @@ -57,7 +57,7 @@ def test_filter_copy(config, filter_probs_df): assert output_control_message is None # Only one row has a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.5, 0.8], [0.2, 0.4, 0.3], @@ -65,11 +65,11 @@ def test_filter_copy(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 1:1, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 1:1, :].to_numpy().tolist() # Two adjacent rows have a value above the threashold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -79,11 +79,11 @@ def test_filter_copy(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 2:3, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 2:3, :].to_numpy().tolist() # Two non-adjacent rows have a value above the threashold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -92,17 +92,17 @@ def test_filter_copy(config, filter_probs_df): [0.2, 0.4, 0.3], ]) - mask = cp.zeros(len(filter_probs_df), dtype=cp.bool_) + mask = np.zeros(len(filter_probs_df), dtype=np.bool_) mask[2] = True mask[4] = True mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = 
fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - mask, :].to_cupy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + mask, :].to_numpy().tolist() -@pytest.mark.use_cudf +@pytest.mark.use_pandas @pytest.mark.parametrize('do_copy', [True, False]) @pytest.mark.parametrize('threshold', [0.1, 0.5, 0.8]) @pytest.mark.parametrize('field_name', ['v1', 'v2', 'v3', 'v4']) @@ -112,22 +112,19 @@ def test_filter_column(config, filter_probs_df, do_copy, threshold, field_name): copy=do_copy, filter_source=FilterSource.DATAFRAME, field_name=field_name) - expected_df = filter_probs_df.to_pandas() - expected_df = expected_df[expected_df[field_name] > threshold] + expected_df = filter_probs_df[filter_probs_df[field_name] > threshold] - probs = cp.zeros([len(filter_probs_df), 3], 'float') - - # All values are at or below the threshold + probs = np.zeros([len(filter_probs_df), 3], 'float') mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_copy(mock_control_message) - assert output_control_message.payload().get_data().to_cupy().tolist() == expected_df.to_numpy().tolist() + assert output_control_message.payload().get_data().to_numpy().tolist() == expected_df.to_numpy().tolist() -@pytest.mark.use_cudf +@pytest.mark.use_pandas def test_filter_slice(config, filter_probs_df): fds = FilterDetectionsStage(config, threshold=0.5, filter_source=FilterSource.TENSOR) - probs = cp.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) + probs = np.array([[0.1, 0.5, 0.3], [0.2, 0.3, 0.4]]) # All values are at or below the threshold @@ -136,7 +133,7 @@ def test_filter_slice(config, filter_probs_df): assert len(output_control_message) == 0 # Only one row has a value above the threshold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.5, 0.8], [0.2, 0.4, 0.3], @@ -144,12 
+141,11 @@ def test_filter_slice(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_slice(mock_control_message) - assert len(output_control_message) == 1 - assert output_control_message[0].payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 1:1, :].to_cupy().tolist() + assert output_control_message[0].payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 1:1, :].to_numpy().tolist() # Two adjacent rows have a value above the threashold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -159,12 +155,11 @@ def test_filter_slice(config, filter_probs_df): mock_control_message = _make_control_message(filter_probs_df, probs) output_control_message = fds._controller.filter_slice(mock_control_message) - assert len(output_control_message) == 1 - assert output_control_message[0].payload().get_data().to_cupy().tolist() == filter_probs_df.loc[ - 2:3, :].to_cupy().tolist() + assert output_control_message[0].payload().get_data().to_numpy().tolist() == filter_probs_df.loc[ + 2:3, :].to_numpy().tolist() # Two non-adjacent rows have a value above the threashold - probs = cp.array([ + probs = np.array([ [0.2, 0.4, 0.3], [0.1, 0.2, 0.3], [0.1, 0.5, 0.8], @@ -180,5 +175,5 @@ def test_filter_slice(config, filter_probs_df): assert control_msg1.payload().count == 1 assert control_msg2.payload().count == 1 - assert control_msg1.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[2:2, :].to_cupy().tolist() - assert control_msg2.payload().get_data().to_cupy().tolist() == filter_probs_df.loc[4:4, :].to_cupy().tolist() + assert control_msg1.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[2:2, :].to_numpy().tolist() + assert control_msg2.payload().get_data().to_numpy().tolist() == filter_probs_df.loc[4:4, :].to_numpy().tolist() diff --git a/tests/morpheus/stages/test_filter_detections_stage_pipe.py 
b/tests/morpheus/stages/test_filter_detections_stage_pipe.py index f2d6e7dcdb..72a6e1fb7c 100755 --- a/tests/morpheus/stages/test_filter_detections_stage_pipe.py +++ b/tests/morpheus/stages/test_filter_detections_stage_pipe.py @@ -71,13 +71,13 @@ def _test_filter_detections_stage_pipe(config: Config, def _test_filter_detections_control_message_stage_multi_segment_pipe(config: Config, - dataset_pandas: DatasetManager, + dataset: DatasetManager, copy: bool = True): threshold = 0.75 - input_df = dataset_pandas["filter_probs.csv"] + input_df = dataset["filter_probs.csv"] pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + pipe.set_source(InMemorySourceStage(config, [input_df])) pipe.add_segment_boundary(MessageMeta) pipe.add_stage(DeserializeStage(config)) pipe.add_segment_boundary(data_type=ControlMessage) @@ -87,8 +87,7 @@ def _test_filter_detections_control_message_stage_multi_segment_pipe(config: Con pipe.add_segment_boundary(ControlMessage) pipe.add_stage(SerializeStage(config)) pipe.add_segment_boundary(MessageMeta) - comp_stage = pipe.add_stage( - CompareDataFrameStage(config, build_expected(dataset_pandas["filter_probs.csv"], threshold))) + comp_stage = pipe.add_stage(CompareDataFrameStage(config, build_expected(dataset["filter_probs.csv"], threshold))) pipe.run() assert_results(comp_stage.get_results()) @@ -108,6 +107,7 @@ def test_filter_detections_stage_pipe(config: Config, return _test_filter_detections_stage_pipe(config, dataset_pandas, do_copy, order, pipeline_batch_size, repeat) +@pytest.mark.slow @pytest.mark.parametrize('do_copy', [True, False]) def test_filter_detections_control_message_stage_multi_segment_pipe(config: Config, dataset_pandas: DatasetManager, diff --git a/tests/morpheus/stages/test_generate_viz_frames_stage.py b/tests/morpheus/stages/test_generate_viz_frames_stage.py index 879220d204..77da125263 100644 --- a/tests/morpheus/stages/test_generate_viz_frames_stage.py +++ 
b/tests/morpheus/stages/test_generate_viz_frames_stage.py @@ -21,10 +21,10 @@ import cudf -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.postprocess.generate_viz_frames_stage import GenerateVizFramesStage @@ -32,7 +32,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm diff --git a/tests/morpheus/stages/test_http_server_sink_stage.py b/tests/morpheus/stages/test_http_server_sink_stage.py index 9702ec1dd5..1f9359dbf2 100644 --- a/tests/morpheus/stages/test_http_server_sink_stage.py +++ b/tests/morpheus/stages/test_http_server_sink_stage.py @@ -89,7 +89,7 @@ def _custom_serializer(df: DataFrameType) -> str: @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode @pytest.mark.parametrize("lines", [False, True]) @pytest.mark.parametrize("max_rows_per_response", [10000, 10]) @pytest.mark.parametrize("df_serializer_fn", [None, _custom_serializer]) diff --git a/tests/morpheus/stages/test_http_server_source_stage.py b/tests/morpheus/stages/test_http_server_source_stage.py index 268d6bbc29..b98c931eab 100644 --- a/tests/morpheus/stages/test_http_server_source_stage.py +++ b/tests/morpheus/stages/test_http_server_source_stage.py @@ -58,7 +58,7 @@ def join(self, timeout=None): @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.parametrize("lines", [False, True], ids=["json", "lines"]) @pytest.mark.parametrize("use_payload_to_df_fn", [False, True], ids=["no_payload_to_df_fn", "payload_to_df_fn"]) def test_generate_frames(config: Config, @@ -99,6 +99,9 @@ def test_generate_frames(config: Config, lines=lines, 
payload_to_df_fn=payload_to_df_fn) + if not use_payload_to_df_fn: + stage._set_default_payload_to_df_fn() + generate_frames = stage._generate_frames(mock_subscription) msg_queue = queue.SimpleQueue() @@ -155,7 +158,7 @@ def test_constructor_invalid_accept_status(config: Config, invalid_accept_status @pytest.mark.slow -@pytest.mark.use_python +@pytest.mark.cpu_mode @pytest.mark.parametrize( "lines", [False, pytest.param(True, marks=pytest.mark.skip(reason="https://github.com/rapidsai/cudf/issues/15820"))], diff --git a/tests/morpheus/stages/test_inference_stage.py b/tests/morpheus/stages/test_inference_stage.py index 262f84b3e3..10370210a1 100755 --- a/tests/morpheus/stages/test_inference_stage.py +++ b/tests/morpheus/stages/test_inference_stage.py @@ -22,9 +22,10 @@ import cudf -import morpheus._lib.messages as _messages from _utils.inference_worker import IW from morpheus.messages import ControlMessage +from morpheus.messages import InferenceMemory +from morpheus.messages import ResponseMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.inference.inference_stage import InferenceStage @@ -45,12 +46,11 @@ def _mk_control_message(mess_count=1, count=1): msg = ControlMessage() msg.payload(MessageMeta(df)) msg.tensors( - _messages.InferenceMemory( - count=total_tensor_count, - tensors={ - "probs": cp.random.rand(total_tensor_count, 2), - "seq_ids": cp.tile(cp.expand_dims(cp.arange(0, total_tensor_count), axis=1), (1, 3)) - })) + InferenceMemory(count=total_tensor_count, + tensors={ + "probs": cp.random.rand(total_tensor_count, 2), + "seq_ids": cp.tile(cp.expand_dims(cp.arange(0, total_tensor_count), axis=1), (1, 3)) + })) return msg @@ -95,14 +95,14 @@ def test_join(config): worker.join.assert_awaited_once() -@pytest.mark.use_python +@pytest.mark.gpu_mode def test_convert_one_response(): # Test ControlMessage # Test first branch where `inf.mess_count == inf.count` - mem = _messages.ResponseMemory(count=4, tensors={"probs": cp.zeros((4, 
3))}) + mem = ResponseMemory(count=4, tensors={"probs": cp.zeros((4, 3))}) inf = _mk_control_message(mess_count=4, count=4) - res = _messages.ResponseMemory(count=4, tensors={"probs": cp.random.rand(4, 3)}) + res = ResponseMemory(count=4, tensors={"probs": cp.random.rand(4, 3)}) output = _mk_control_message(mess_count=4, count=4) output.tensors(mem) @@ -115,10 +115,9 @@ def test_convert_one_response(): # Test for the second branch inf = _mk_control_message(mess_count=2, count=3) inf.tensors().set_tensor("seq_ids", cp.array([[0], [1], [1]])) - res = _messages.ResponseMemory(count=3, - tensors={"probs": cp.array([[0, 0.6, 0.7], [5.6, 4.4, 9.2], [4.5, 6.7, 8.9]])}) + res = ResponseMemory(count=3, tensors={"probs": cp.array([[0, 0.6, 0.7], [5.6, 4.4, 9.2], [4.5, 6.7, 8.9]])}) - mem = _messages.ResponseMemory(count=2, tensors={"probs": cp.zeros((2, 3))}) + mem = ResponseMemory(count=2, tensors={"probs": cp.zeros((2, 3))}) output = _mk_control_message(mess_count=2, count=3) output.tensors(mem) cm = InferenceStageT._convert_one_response(output, inf, res) diff --git a/tests/morpheus/stages/test_kafka_source_stage_pipe.py b/tests/morpheus/stages/test_kafka_source_stage_pipe.py index cb5adda659..92d93a6c6a 100644 --- a/tests/morpheus/stages/test_kafka_source_stage_pipe.py +++ b/tests/morpheus/stages/test_kafka_source_stage_pipe.py @@ -39,6 +39,7 @@ from kafka import KafkaConsumer +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka def test_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str, kafka_topics: KafkaTopics) -> None: input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.jsonlines") @@ -63,6 +64,7 @@ def test_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str, k assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka def test_multi_topic_kafka_source_stage_pipe(config: Config, kafka_bootstrap_servers: str) -> None: input_file = os.path.join(TEST_DIRS.tests_data_dir, 
"filter_probs.jsonlines") @@ -95,6 +97,7 @@ def test_multi_topic_kafka_source_stage_pipe(config: Config, kafka_bootstrap_ser assert_results(comp_stage.get_results()) +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka @pytest.mark.parametrize('async_commits', [True, False]) @pytest.mark.parametrize('num_records', [10, 100, 1000]) @@ -150,6 +153,7 @@ def test_kafka_source_commit(num_records: int, assert actual_offset == expected_offset +@pytest.mark.gpu_and_cpu_mode @pytest.mark.kafka @pytest.mark.parametrize('num_records', [1000]) def test_kafka_source_batch_pipe(config: Config, diff --git a/tests/morpheus/stages/test_linear_modules_stage.py b/tests/morpheus/stages/test_linear_modules_stage.py index 8209b96b6e..b0c16de59d 100755 --- a/tests/morpheus/stages/test_linear_modules_stage.py +++ b/tests/morpheus/stages/test_linear_modules_stage.py @@ -23,13 +23,14 @@ from morpheus.stages.general.linear_modules_stage import LinearModulesStage from morpheus.utils.module_utils import mrc_version -module_config = { - "module_id": "TestSimpleModule", "module_name": "test_simple_module", "namespace": "test_morpheus_modules" -} +@pytest.fixture(name="module_config") +def module_config_fixture(): + return {"module_id": "TestSimpleModule", "module_name": "test_simple_module", "namespace": "test_morpheus_modules"} -@pytest.mark.use_python -def test_constructor(config): + +@pytest.mark.gpu_and_cpu_mode +def test_constructor(config, module_config: dict): mod_stage = LinearModulesStage(config, module_config, input_port_name="test_in", output_port_name="test_out") @@ -44,8 +45,8 @@ def test_constructor(config): pytest.raises(NotImplementedError, mod_stage._get_cpp_module_node, None) -@pytest.mark.use_python -def test_build_single_before_module_registration(config): +@pytest.mark.gpu_and_cpu_mode +def test_build_single_before_module_registration(config, module_config: dict): mock_node = mock.MagicMock() mock_segment = mock.MagicMock() @@ -61,19 +62,20 @@ def 
test_build_single_before_module_registration(config): mod_stage._build_single(mock_segment, mock_input_stream) -def register_test_module(): +def register_test_module(id_postfix: str): registry = mrc.ModuleRegistry def module_init_fn(_: mrc.Builder): pass - registry.register_module("TestSimpleModule", "test_morpheus_modules", mrc_version, module_init_fn) + registry.register_module(f"TestSimpleModule_{id_postfix}", "test_morpheus_modules", mrc_version, module_init_fn) -@pytest.mark.use_python -def test_build_single_after_module_registration(config): +@pytest.mark.gpu_and_cpu_mode +def test_build_single_after_module_registration(config, module_config: dict): - register_test_module() + register_test_module(config.execution_mode.value) + module_config["module_id"] = f"{module_config['module_id']}_{config.execution_mode.value}" mock_node = mock.MagicMock() mock_segment = mock.MagicMock() diff --git a/tests/morpheus/stages/test_ml_flow_drift_stage.py b/tests/morpheus/stages/test_ml_flow_drift_stage.py index f5eaca0229..ff0f4d2a92 100644 --- a/tests/morpheus/stages/test_ml_flow_drift_stage.py +++ b/tests/morpheus/stages/test_ml_flow_drift_stage.py @@ -21,9 +21,9 @@ import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.messages import ControlMessage -from morpheus.messages.message_meta import MessageMeta +from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.postprocess.ml_flow_drift_stage import MLFlowDriftStage @@ -31,7 +31,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) return cm @@ -46,7 +46,6 @@ def test_constructor(config): @pytest.mark.use_cudf -@pytest.mark.use_python def test_calc_drift(config, filter_probs_df): with 
patch("morpheus.stages.postprocess.ml_flow_drift_stage.mlflow.start_run"): labels = ["a", "b", "c"] diff --git a/tests/morpheus/stages/test_monitor_stage.py b/tests/morpheus/stages/test_monitor_stage.py index b6ff56c6b4..e50153e7e5 100755 --- a/tests/morpheus/stages/test_monitor_stage.py +++ b/tests/morpheus/stages/test_monitor_stage.py @@ -179,7 +179,7 @@ def test_log_level(mock_progress_sink: mock.MagicMock, assert mock_sink_on_completed.call_count == expected_call_count -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode def test_thread(config: Config, morpheus_log_level: int): """ Test ensures the monitor stage executes on the same thread as the parent stage diff --git a/tests/morpheus/stages/test_multi_port_modules_stage.py b/tests/morpheus/stages/test_multi_port_modules_stage.py index 31ac032546..ca4e3f35ec 100755 --- a/tests/morpheus/stages/test_multi_port_modules_stage.py +++ b/tests/morpheus/stages/test_multi_port_modules_stage.py @@ -51,7 +51,7 @@ def registered_module_conf(): yield registered_module_conf -@pytest.mark.use_python +@pytest.mark.gpu_and_cpu_mode def test_constructor(config, unregistered_module_conf): mod_stage = MultiPortModulesStage(config, diff --git a/tests/morpheus/stages/test_multi_processing_stage.py b/tests/morpheus/stages/test_multi_processing_stage.py index 470c27c1f8..f88ec3d7d8 100644 --- a/tests/morpheus/stages/test_multi_processing_stage.py +++ b/tests/morpheus/stages/test_multi_processing_stage.py @@ -26,6 +26,7 @@ from _utils import assert_results from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.config import ExecutionMode from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from morpheus.pipeline import LinearPipeline @@ -54,6 +55,7 @@ def _process_df(df: pd.DataFrame, column: str, value: str) -> pd.DataFrame: return df +@pytest.mark.gpu_and_cpu_mode def test_create_stage_type_deduction(config: Config, dataset_pandas: DatasetManager): # 
Test create() with normal function @@ -110,26 +112,39 @@ def __init__(self, self._add_column_name = add_column_name self._shared_process_pool.set_usage(self.name, self._process_pool_usage) + self._execution_mode = c.execution_mode @property def name(self) -> str: return "derived-multi-processing-stage" + def supported_execution_modes(self) -> tuple[ExecutionMode]: + """ + Returns a tuple of supported execution modes of this stage. + """ + return (ExecutionMode.GPU, ExecutionMode.CPU) + def _on_data(self, data: ControlMessage) -> ControlMessage: input_df = data.payload().copy_dataframe() - pdf = input_df.to_pandas() + if self._execution_mode == ExecutionMode.GPU: + input_df = input_df.to_pandas() + partial_process_fn = partial(_process_df, column=self._add_column_name, value="Hello") - task = self._shared_process_pool.submit_task(self.name, partial_process_fn, pdf) + task = self._shared_process_pool.submit_task(self.name, partial_process_fn, input_df) + + df = task.result() + if self._execution_mode == ExecutionMode.GPU: + df = cudf.DataFrame.from_pandas(df) - df = cudf.DataFrame.from_pandas(task.result()) meta = MessageMeta(df) data.payload(meta) return data +@pytest.mark.gpu_and_cpu_mode def test_derived_stage_type_deduction(config: Config): mp_stage = DerivedMultiProcessingStage(c=config, process_pool_usage=0.1, add_column_name="new_column") @@ -142,13 +157,12 @@ def test_derived_stage_type_deduction(config: Config): def pandas_dataframe_generator(dataset_pandas: DatasetManager, count: int) -> Generator[pd.DataFrame, None, None]: - - df = dataset_pandas["csv_sample.csv"] for _ in range(count): - yield df + yield dataset_pandas["csv_sample.csv"] @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def test_created_stage_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -178,6 +192,7 @@ def test_created_stage_pipe(config: Config, dataset_pandas: DatasetManager): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def 
test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -188,7 +203,7 @@ def test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): expected_df[add_column_name] = "Hello" pipe = LinearPipeline(config) - pipe.set_source(InMemorySourceStage(config, [cudf.DataFrame(input_df)])) + pipe.set_source(InMemorySourceStage(config, [input_df])) pipe.add_stage(DeserializeStage(config, ensure_sliceable_index=True)) pipe.add_stage(DerivedMultiProcessingStage(c=config, process_pool_usage=0.1, add_column_name=add_column_name)) pipe.add_stage(SerializeStage(config)) @@ -200,6 +215,7 @@ def test_derived_stage_pipe(config: Config, dataset_pandas: DatasetManager): @pytest.mark.slow +@pytest.mark.gpu_and_cpu_mode def test_multiple_stages_pipe(config: Config, dataset_pandas: DatasetManager): config.num_threads = os.cpu_count() @@ -214,9 +230,8 @@ def test_multiple_stages_pipe(config: Config, dataset_pandas: DatasetManager): partial_fn = partial(_process_df, column="new_column_1", value="new_value") - @stage - def pdf_to_control_message_stage(pdf: pd.DataFrame) -> ControlMessage: - df = cudf.DataFrame.from_pandas(pdf) + @stage(execution_modes=(ExecutionMode.CPU, ExecutionMode.GPU)) + def pdf_to_control_message_stage(df: pd.DataFrame) -> ControlMessage: meta = MessageMeta(df) msg = ControlMessage() msg.payload(meta) diff --git a/tests/morpheus/stages/test_preprocess_fil_stage.py b/tests/morpheus/stages/test_preprocess_fil_stage.py index cb9c5045be..57456e804b 100644 --- a/tests/morpheus/stages/test_preprocess_fil_stage.py +++ b/tests/morpheus/stages/test_preprocess_fil_stage.py @@ -15,21 +15,17 @@ import typing -import cupy as cp import pytest import typing_utils -import cudf - from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_fil_stage import PreprocessFILStage 
@pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.feature_length = 1 config.fil = ConfigFIL() config.fil.feature_columns = ["data"] @@ -44,18 +40,3 @@ def test_constructor(config: Config): accepted_union = typing.Union[stage.accepted_types()] assert typing_utils.issubtype(ControlMessage, accepted_union) - - -def test_process_control_message(config: Config): - stage = PreprocessFILStage(config) - input_cm = ControlMessage() - df = cudf.DataFrame({"data": [1, 2, 3]}) - meta = MessageMeta(df) - input_cm.payload(meta) - - output_cm = stage.pre_process_batch(input_cm, stage._fea_length, stage.features) - assert cp.array_equal(output_cm.tensors().get_tensor("input__0"), cp.asarray(df.to_cupy())) - expect_seq_ids = cp.zeros((df.shape[0], 3), dtype=cp.uint32) - expect_seq_ids[:, 0] = cp.arange(0, df.shape[0], dtype=cp.uint32) - expect_seq_ids[:, 2] = stage._fea_length - 1 - assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), expect_seq_ids) diff --git a/tests/morpheus/stages/test_preprocess_nlp_stage.py b/tests/morpheus/stages/test_preprocess_nlp_stage.py index 764f5e94c9..1672768cae 100644 --- a/tests/morpheus/stages/test_preprocess_nlp_stage.py +++ b/tests/morpheus/stages/test_preprocess_nlp_stage.py @@ -14,23 +14,17 @@ # limitations under the License. 
import typing -from unittest.mock import Mock -from unittest.mock import patch -import cupy as cp import pytest import typing_utils -import cudf - from morpheus.config import Config from morpheus.messages import ControlMessage -from morpheus.messages import MessageMeta from morpheus.stages.preprocess.preprocess_nlp_stage import PreprocessNLPStage @pytest.fixture(name='config') -def fixture_config(config: Config, use_cpp: bool): # pylint: disable=unused-argument +def fixture_config(config: Config): config.class_labels = [ "address", "bank_acct", @@ -64,31 +58,3 @@ def test_constructor(config: Config): accepted_union = typing.Union[stage.accepted_types()] assert typing_utils.issubtype(ControlMessage, accepted_union) - - -@patch("morpheus.stages.preprocess.preprocess_nlp_stage.tokenize_text_series") -def test_process_control_message(mock_tokenize_text_series, config: Config): - mock_tokenized = Mock() - mock_tokenized.input_ids = cp.array([[1, 2], [1, 2]]) - mock_tokenized.input_mask = cp.array([[3, 4], [3, 4]]) - mock_tokenized.segment_ids = cp.array([[0, 0], [1, 1]]) - mock_tokenize_text_series.return_value = mock_tokenized - - stage = PreprocessNLPStage(config) - input_cm = ControlMessage() - df = cudf.DataFrame({"data": ["a", "b", "c"]}) - meta = MessageMeta(df) - input_cm.payload(meta) - - output_cm = stage.pre_process_batch(input_cm, - stage._vocab_hash_file, - stage._do_lower_case, - stage._seq_length, - stage._stride, - stage._truncation, - stage._add_special_tokens, - stage._column) - assert output_cm.get_metadata("inference_memory_params") == {"inference_type": "nlp"} - assert cp.array_equal(output_cm.tensors().get_tensor("input_ids"), mock_tokenized.input_ids) - assert cp.array_equal(output_cm.tensors().get_tensor("input_mask"), mock_tokenized.input_mask) - assert cp.array_equal(output_cm.tensors().get_tensor("seq_ids"), mock_tokenized.segment_ids) diff --git a/tests/morpheus/stages/test_rss_source_stage_pipe.py 
b/tests/morpheus/stages/test_rss_source_stage_pipe.py index ab5a3f0951..84beb5d636 100644 --- a/tests/morpheus/stages/test_rss_source_stage_pipe.py +++ b/tests/morpheus/stages/test_rss_source_stage_pipe.py @@ -28,15 +28,13 @@ invalid_feed_input = os.path.join(TEST_DIRS.tests_data_dir, "rss_feed_atom.xm") -@pytest.mark.use_python def test_support_cpp_node(config): url_feed_input = "https://fake.nvidia.com/rss/HomePage.xml" - rss_source_stage = RSSSourceStage(config, feed_input=url_feed_input) + rss_source_stage = RSSSourceStage(config, feed_input=[url_feed_input]) assert rss_source_stage.supports_cpp_node() is False -@pytest.mark.use_python @pytest.mark.parametrize( "feed_input, batch_size, expected_count, enable_cache", [([valid_feed_input], 30, 1, False), ([valid_feed_input], 12, 3, True), @@ -61,9 +59,7 @@ def test_rss_source_stage_pipe(config: Config, assert len(sink_stage.get_messages()) == expected_count -# TODO(Devin): Remove before merge, this isn't a stage test, this is a test of RSSController -# @pytest.mark.use_python -# def test_invalid_input_rss_source_stage(config: Config): -# -# with pytest.raises(ValueError, match=f"Invalid URL or file path: {invalid_feed_input}"): -# RSSSourceStage(config, feed_input=[invalid_feed_input], interval_secs=1, cooldown_interval=500) +def test_invalid_input_rss_source_stage(config: Config): + + with pytest.raises(ValueError, match=f"Invalid URL or file path: {invalid_feed_input}"): + RSSSourceStage(config, feed_input=[invalid_feed_input], interval_secs=1, cooldown_interval=500) diff --git a/tests/morpheus/stages/test_serialize_stage.py b/tests/morpheus/stages/test_serialize_stage.py index 850950d4c0..73420668cb 100755 --- a/tests/morpheus/stages/test_serialize_stage.py +++ b/tests/morpheus/stages/test_serialize_stage.py @@ -16,25 +16,28 @@ import re +import pandas as pd import pytest -import cudf - from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta from 
morpheus.stages.postprocess.serialize_stage import SerializeStage -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_fixed_columns(config): - df1 = cudf.DataFrame() + """ + The serialize stage works in both GPU and CPU mode, however this test is only for CPU mode since it is testing the + CPU implementation of the stage. + """ + df1 = pd.DataFrame() df1['apples'] = range(0, 4) df1['pears'] = range(5, 9) df1['apple_sauce'] = range(4, 0, -1) cm1 = ControlMessage() cm1.payload(MessageMeta(df1)) - df2 = cudf.DataFrame() + df2 = pd.DataFrame() df2['apples'] = range(4, 7) df2['applause'] = range(9, 6, -1) df2['pears'] = range(7, 10) diff --git a/tests/morpheus/stages/test_timeseries_stage.py b/tests/morpheus/stages/test_timeseries_stage.py index 51bca65c06..981eaab104 100644 --- a/tests/morpheus/stages/test_timeseries_stage.py +++ b/tests/morpheus/stages/test_timeseries_stage.py @@ -21,10 +21,10 @@ import pytest import typing_utils -import morpheus._lib.messages as _messages from morpheus.config import Config from morpheus.config import ConfigAutoEncoder from morpheus.messages import ControlMessage +from morpheus.messages import TensorMemory from morpheus.messages.message_meta import MessageMeta from morpheus.stages.postprocess.timeseries_stage import TimeSeriesStage @@ -42,7 +42,7 @@ def _make_control_message(df, probs): df_ = df[0:len(probs)] cm = ControlMessage() cm.payload(MessageMeta(df_)) - cm.tensors(_messages.TensorMemory(count=len(df_), tensors={'probs': probs})) + cm.tensors(TensorMemory(count=len(df_), tensors={'probs': probs})) cm.set_metadata("user_id", "test_user_id") return cm @@ -56,8 +56,7 @@ def test_constructor(config): assert typing_utils.issubtype(ControlMessage, accepted_union) -@pytest.mark.use_cudf -@pytest.mark.use_python +@pytest.mark.cpu_mode def test_call_timeseries_user(config): stage = TimeSeriesStage(config) diff --git a/tests/morpheus/stages/test_triton_inference_stage.py b/tests/morpheus/stages/test_triton_inference_stage.py index 
09dfaafb8a..abbd7ab262 100644 --- a/tests/morpheus/stages/test_triton_inference_stage.py +++ b/tests/morpheus/stages/test_triton_inference_stage.py @@ -25,6 +25,7 @@ from _utils import assert_results from _utils import mk_async_infer +from morpheus.common import TypeId from morpheus.config import Config from morpheus.config import ConfigFIL from morpheus.config import PipelineModes @@ -122,16 +123,16 @@ def test_resource_pool_create_raises_error(): assert pool.borrow_obj() == 20 -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.parametrize("pipeline_mode", list(PipelineModes)) def test_stage_constructor_worker_class(config: Config, pipeline_mode: PipelineModes): config.mode = pipeline_mode - stage = TritonInferenceStage(config, model_name='test', server_url='test:0000') + stage = TritonInferenceStage(config, model_name='test', server_url='test:0000', use_shared_memory=True) worker = stage._get_inference_worker(ProducerConsumerQueue()) assert isinstance(worker, TritonInferenceWorker) -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.parametrize("pipeline_mode", list(PipelineModes)) @pytest.mark.parametrize("needs_logits", [True, False, None]) def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes, needs_logits: bool | None): @@ -142,7 +143,11 @@ def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes config.mode = pipeline_mode - stage = TritonInferenceStage(config, model_name='test', server_url='test:0000', needs_logits=needs_logits) + stage = TritonInferenceStage(config, + model_name='test', + server_url='test:0000', + needs_logits=needs_logits, + use_shared_memory=True) worker = stage._get_inference_worker(ProducerConsumerQueue()) assert isinstance(worker, TritonInferenceWorker) @@ -150,8 +155,7 @@ def test_stage_get_inference_worker(config: Config, pipeline_mode: PipelineModes @pytest.mark.slow -@pytest.mark.use_python -# @pytest.mark.parametrize('num_records', [1000, 2000, 4000]) 
+@pytest.mark.gpu_mode @pytest.mark.parametrize('num_records', [10]) @mock.patch('tritonclient.grpc.InferenceServerClient') def test_triton_stage_pipe(mock_triton_client, config, num_records): @@ -196,8 +200,13 @@ def test_triton_stage_pipe(mock_triton_client, config, num_records): pipe_cm.add_stage(DeserializeStage(config)) pipe_cm.add_stage(PreprocessFILStage(config)) pipe_cm.add_stage( - TritonInferenceStage(config, model_name='abp-nvsmi-xgb', server_url='test:0000', force_convert_inputs=True)) - pipe_cm.add_stage(AddScoresStage(config, prefix="score_")) + # Intentionally using use_shared_memory=True as this is the only way to use the Python impl + TritonInferenceStage(config, + model_name='abp-nvsmi-xgb', + server_url='test:0000', + force_convert_inputs=True, + use_shared_memory=True)) + pipe_cm.add_stage(AddScoresStage(config, prefix="score_", probs_type=TypeId.FLOAT64)) pipe_cm.add_stage(SerializeStage(config)) comp_stage = pipe_cm.add_stage(CompareDataFrameStage(config, expected_df)) diff --git a/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py b/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py index 199ff8319d..d63dbee4ca 100644 --- a/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py +++ b/tests/morpheus/stages/test_write_to_elasticsearch_stage_pipe.py @@ -17,16 +17,14 @@ import typing from unittest.mock import patch -import pandas as pd import pytest import yaml -import cudf - from morpheus.config import Config from morpheus.pipeline.linear_pipeline import LinearPipeline from morpheus.stages.input.in_memory_source_stage import InMemorySourceStage from morpheus.stages.output.write_to_elasticsearch_stage import WriteToElasticsearchStage +from morpheus.utils.type_aliases import DataFrameType def connection_kwargs_func(kwargs): @@ -47,7 +45,6 @@ def connection_conf_file_fixture(tmp_path): yield connection_conf_file -@pytest.mark.use_python @pytest.mark.parametrize("conf_file, exception", [("connection_conf.yaml", 
FileNotFoundError), (None, Exception)]) def test_constructor_invalid_conf_file(config: Config, conf_file: str, @@ -56,7 +53,6 @@ def test_constructor_invalid_conf_file(config: Config, WriteToElasticsearchStage(config, index="t_index", connection_conf_file=conf_file) -@pytest.mark.use_python @patch("morpheus.controllers.elasticsearch_controller.Elasticsearch") def test_constructor_with_custom_func(config: Config, connection_conf_file: str): expected_connection_kwargs = { @@ -73,12 +69,12 @@ def test_constructor_with_custom_func(config: Config, connection_conf_file: str) assert stage._controller._connection_kwargs == expected_connection_kwargs -@pytest.mark.use_python +@pytest.mark.use_cudf @patch("morpheus.stages.output.write_to_elasticsearch_stage.ElasticsearchController") def test_write_to_elasticsearch_stage_pipe(mock_controller: typing.Any, connection_conf_file: str, config: Config, - filter_probs_df: typing.Union[cudf.DataFrame, pd.DataFrame]): + filter_probs_df: DataFrameType): mock_df_to_parallel_bulk_write = mock_controller.return_value.df_to_parallel_bulk_write mock_refresh_client = mock_controller.return_value.refresh_client @@ -92,14 +88,11 @@ def test_write_to_elasticsearch_stage_pipe(mock_controller: typing.Any, # Run the pipeline pipe.run() - if isinstance(filter_probs_df, cudf.DataFrame): - filter_probs_df = filter_probs_df.to_pandas() - expected_index = mock_df_to_parallel_bulk_write.call_args[1]["index"] - expected_df = mock_df_to_parallel_bulk_write.call_args[1]["df"] + actual_df = mock_df_to_parallel_bulk_write.call_args[1]["df"] mock_refresh_client.assert_called_once() mock_df_to_parallel_bulk_write.assert_called_once() assert expected_index == "t_index" - assert expected_df.equals(filter_probs_df) + assert actual_df.equals(filter_probs_df.to_pandas()) diff --git a/tests/morpheus/stages/test_write_to_file_stage.py b/tests/morpheus/stages/test_write_to_file_stage.py deleted file mode 100755 index 002d3d4808..0000000000 --- 
a/tests/morpheus/stages/test_write_to_file_stage.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python -# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from unittest import mock - -import pytest - -from _utils import TEST_DIRS -from morpheus.config import Config -from morpheus.pipeline import LinearPipeline -from morpheus.stages.input.file_source_stage import FileSourceStage -from morpheus.stages.output.write_to_file_stage import WriteToFileStage - - -@pytest.mark.use_python -@pytest.mark.parametrize("flush", [False, True]) -@pytest.mark.parametrize("output_type", ["csv", "json", "jsonlines"]) -def test_file_rw_pipe(tmp_path: str, config: Config, output_type: str, flush: bool): - """ - Test the flush functionality of the WriteToFileStage. 
- """ - input_file = os.path.join(TEST_DIRS.tests_data_dir, "filter_probs.csv") - out_file = os.path.join(tmp_path, f'results.{output_type}') - - # This currently works because the FileSourceStage doesn't use the builtin open function, but WriteToFileStage does - mock_open = mock.mock_open() - with mock.patch('builtins.open', mock_open): - pipe = LinearPipeline(config) - pipe.set_source(FileSourceStage(config, filename=input_file)) - pipe.add_stage(WriteToFileStage(config, filename=out_file, overwrite=False, flush=flush)) - pipe.run() - - assert not os.path.exists(out_file) - assert mock_open().flush.called == flush diff --git a/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py b/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py index 94b17a196c..56f9a7dcff 100644 --- a/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py +++ b/tests/morpheus/stages/test_write_to_kafka_stage_pipe.py @@ -14,12 +14,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import types import typing import pytest -import cudf - from _utils.dataset_manager import DatasetManager from _utils.kafka import KafkaTopics from morpheus.pipeline.linear_pipeline import LinearPipeline @@ -33,9 +32,10 @@ @pytest.mark.kafka -@pytest.mark.use_cudf +@pytest.mark.gpu_and_cpu_mode def test_write_to_kafka_stage_pipe(config, - dataset_cudf: DatasetManager, + df_pkg: types.ModuleType, + dataset: DatasetManager, kafka_bootstrap_servers: str, kafka_consumer: "KafkaConsumer", kafka_topics: KafkaTopics) -> None: @@ -44,7 +44,7 @@ def test_write_to_kafka_stage_pipe(config, to ensure it works just as well with the C++ impls of the message classes. 
""" - filter_probs_df = dataset_cudf['filter_probs.csv'] + filter_probs_df = dataset['filter_probs.csv'] pipe = LinearPipeline(config) pipe.set_source(InMemorySourceStage(config, [filter_probs_df])) pipe.add_stage(DeserializeStage(config)) @@ -59,9 +59,8 @@ def test_write_to_kafka_stage_pipe(config, kafka_messages = list(kafka_consumer) assert len(kafka_messages) == len(filter_probs_df) - output_df = cudf.io.read_json("\n".join(rec.value.decode("utf-8") for rec in kafka_messages), - lines=True).to_pandas() + output_df = df_pkg.read_json("\n".join(rec.value.decode("utf-8") for rec in kafka_messages), lines=True) assert len(output_df) == len(filter_probs_df) - dataset_cudf.assert_compare_df(filter_probs_df, output_df) + dataset.assert_compare_df(filter_probs_df, output_df) diff --git a/tests/morpheus/test_cli.py b/tests/morpheus/test_cli.py index d97a94c8f2..dfb8a18844 100755 --- a/tests/morpheus/test_cli.py +++ b/tests/morpheus/test_cli.py @@ -134,7 +134,6 @@ def config_warning_fixture(): @pytest.mark.reload_modules(commands) @pytest.mark.usefixtures("chdir_tmpdir", "reload_modules") -@pytest.mark.use_python class TestCLI: @pytest.mark.parametrize( diff --git a/tests/morpheus/test_config.py b/tests/morpheus/test_config.py index 9817b5ab09..f958ce2f8b 100755 --- a/tests/morpheus/test_config.py +++ b/tests/morpheus/test_config.py @@ -17,6 +17,7 @@ import json import logging import os +from dataclasses import FrozenInstanceError from unittest import mock import pytest @@ -109,6 +110,50 @@ def test_to_string(config): assert isinstance(json.loads(conf_str), dict) +def test_frozen(config: morpheus.config.Config): + assert not config.frozen + + # Ensure that it is safe to call freeze() multiple times + for _ in range(2): + config.freeze() + assert config.frozen + + +@pytest.mark.parametrize('use_attr', [False, True]) +def test_frozen_immutible(config: morpheus.config.Config, use_attr: bool): + """ + Test for the freeze functionality. 
+ + There are currently two ways to bypass the freeze functionality: + 1. By accessing the __dict__ attribute of the Config object. + 2. Modifying any of the mutable objects in the Config object (ex: `config.class_labels.append('new_label')`). + """ + assert not config.frozen + + # ensure that we can set some attributes + config.feature_length = 45 + + # freeze the config, freezing the config via the attribute or method should have the same effect, the only + # difference is that it is safe to call freeze() multiple times, while setting the attribute will raise an exception + # just like attempting to set any other attribute on a frozen object + if use_attr: + config.frozen = True + else: + config.freeze() + + assert config.frozen + + with pytest.raises(FrozenInstanceError): + config.feature_length = 100 + + # ensure setattr also raises an exception + with pytest.raises(FrozenInstanceError): + setattr(config, 'feature_length', 100) + + # ensure the config still has the original value + assert config.feature_length == 45 + + def test_warning_model_batch_size_less_than_pipeline_batch_size(caplog: pytest.LogCaptureFixture): config = morpheus.config.Config() config.pipeline_batch_size = 256 diff --git a/tests/morpheus/utils/test_column_info.py b/tests/morpheus/utils/test_column_info.py index f117ca9d9f..fe147c6218 100644 --- a/tests/morpheus/utils/test_column_info.py +++ b/tests/morpheus/utils/test_column_info.py @@ -24,8 +24,6 @@ import pandas as pd import pytest -import cudf - from _utils import TEST_DIRS from morpheus.io.deserializers import read_file_to_df from morpheus.utils.column_info import ColumnInfo @@ -52,14 +50,6 @@ def azure_ad_logs_pdf_fixture(_azure_ad_logs_pdf: pd.DataFrame): yield _azure_ad_logs_pdf.copy(deep=True) -@pytest.fixture(name="azure_ad_logs_cdf", scope="function") -def azure_ad_logs_cdf_fixture(_azure_ad_logs_pdf: pd.DataFrame): - # cudf.from_pandas essentially does a deep copy, so we can use this to ensure that the source pandas df is not - 
# modified - yield cudf.from_pandas(_azure_ad_logs_pdf) - - -@pytest.mark.use_python def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFrame): assert len(azure_ad_logs_pdf.columns) == 16 @@ -106,7 +96,6 @@ def test_dataframe_input_schema_without_json_cols(azure_ad_logs_pdf: pd.DataFram process_dataframe(azure_ad_logs_pdf, schema2) -@pytest.mark.use_python def test_string_cat_column(): cities = pd.Series([ "New York", @@ -156,7 +145,6 @@ def test_string_cat_column(): assert actual.equals(expected) -@pytest.mark.use_python def test_string_join_column(): cities = pd.Series([ "Boston", @@ -175,7 +163,6 @@ def test_string_join_column(): assert actual.equals(expected) -@pytest.mark.use_python def test_column_info(): cities = pd.Series([ "Boston", @@ -193,7 +180,6 @@ def test_column_info(): assert string_join_col.name == "city" -@pytest.mark.use_python def test_date_column(): time_series = pd.Series([ "2022-08-29T21:21:41.645157Z", @@ -212,7 +198,6 @@ def test_date_column(): assert datetime_series.dtype == np.dtype("datetime64[ns]") -@pytest.mark.use_python def test_rename_column(): time_series = pd.Series([ "2022-08-29T21:21:41.645157Z", @@ -235,7 +220,6 @@ def convert_to_upper(df, column_name: str): return df[column_name].str.upper() -@pytest.mark.use_python def test_custom_column(): cities = pd.Series([ "New York", @@ -256,7 +240,6 @@ def test_custom_column(): assert actutal.equals(expected) -@pytest.mark.use_python def test_type_cast(): """ Test reproduces issue reported in #922 diff --git a/tests/morpheus/utils/test_directory_watcher.py b/tests/morpheus/utils/test_directory_watcher.py index d7943bfb29..cc7f3dcccd 100644 --- a/tests/morpheus/utils/test_directory_watcher.py +++ b/tests/morpheus/utils/test_directory_watcher.py @@ -22,7 +22,6 @@ from morpheus.utils.directory_watcher import DirectoryWatcher -@pytest.mark.use_python @pytest.mark.parametrize('watch_directory', [True]) @pytest.mark.parametrize('max_files', [-1]) 
@pytest.mark.parametrize('sort_glob', [True]) diff --git a/tests/morpheus/utils/test_inference_worker.py b/tests/morpheus/utils/test_inference_worker.py index 22af7bff23..cfbefe821c 100755 --- a/tests/morpheus/utils/test_inference_worker.py +++ b/tests/morpheus/utils/test_inference_worker.py @@ -19,10 +19,10 @@ import cudf -import morpheus._lib.messages as _messages from _utils.inference_worker import IW from morpheus.messages import ControlMessage from morpheus.messages import MessageMeta +from morpheus.messages import TensorMemory from morpheus.stages.inference import inference_stage from morpheus.utils.producer_consumer_queue import ProducerConsumerQueue @@ -37,7 +37,7 @@ def test_constructor(): worker.stop() -@pytest.mark.use_python +@pytest.mark.gpu_mode @pytest.mark.usefixtures("config") def test_build_output_message(): @@ -58,7 +58,7 @@ def test_build_output_message(): input__0 = cp.array([[0.], [2.], [4.], [6.], [8.], [10.], [12.], [14.], [16.], [18.]]) seq_ids = cp.array([[0, 0, 0], [1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0], [8, 0, 0], [9, 0, 0]]) - msg.tensors(_messages.TensorMemory(count=num_records, tensors={'input__0': input__0, 'seq_ids': seq_ids})) + msg.tensors(TensorMemory(count=num_records, tensors={'input__0': input__0, 'seq_ids': seq_ids})) output_message = worker.build_output_message(msg) diff --git a/tests/morpheus/utils/test_module_utils.py b/tests/morpheus/utils/test_module_utils.py index adcdc3e660..baf8027a9e 100644 --- a/tests/morpheus/utils/test_module_utils.py +++ b/tests/morpheus/utils/test_module_utils.py @@ -28,7 +28,6 @@ # pylint: disable=unused-argument,too-many-function-args -@pytest.mark.use_python def test_mrc_version(): assert len(mrc_version) == 3 assert isinstance(mrc_version, list) diff --git a/tests/morpheus/utils/test_type_utils.py b/tests/morpheus/utils/test_type_utils.py new file mode 100644 index 0000000000..ab06f39fcb --- /dev/null +++ b/tests/morpheus/utils/test_type_utils.py @@ -0,0 
+1,107 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import types +import typing + +import pandas as pd +import pytest + +import cudf + +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_utils import df_type_str_to_exec_mode +from morpheus.utils.type_utils import df_type_str_to_pkg +from morpheus.utils.type_utils import exec_mode_to_df_type_str +from morpheus.utils.type_utils import get_df_class +from morpheus.utils.type_utils import get_df_pkg +from morpheus.utils.type_utils import is_cudf_type + + +@pytest.mark.parametrize("mode, expected", + [(ExecutionMode.GPU, cudf.DataFrame), (ExecutionMode.CPU, pd.DataFrame), + ("cudf", cudf.DataFrame), ("pandas", pd.DataFrame)]) +def test_get_df_class(mode: typing.Union[ExecutionMode, DataFrameModule], expected: types.ModuleType): + assert get_df_class(mode) is expected + + +@pytest.mark.parametrize("mode, expected", [(ExecutionMode.GPU, cudf), (ExecutionMode.CPU, pd), ("cudf", cudf), + ("pandas", pd)]) +def test_get_df_pkg(mode: typing.Union[ExecutionMode, DataFrameModule], expected: types.ModuleType): + assert get_df_pkg(mode) is expected + + +@pytest.mark.parametrize( + "obj, expected", + [ + (cudf.DataFrame(), True), + (cudf.Series(), True), + (cudf.Index([]), True), + 
(cudf.RangeIndex(0), True), + (pd.DataFrame(), False), + (pd.Series(), False), + (pd.Index([]), False), + (pd.RangeIndex(0), False), + (None, False), + (0, False), + ("test", False), + ], + ids=[ + "cudf.DataFrame", + "cudf.Series", + "cudf.Index", + "cudf.RangeIndex", + "pd.DataFrame", + "pd.Series", + "pd.Index", + "pd.RangeIndex", + "None", + "int", + "str" + ], +) +def test_is_cudf_type(obj: typing.Any, expected: bool): + assert is_cudf_type(obj) == expected + + +@pytest.mark.parametrize("df_type_str, expected", [("cudf", cudf), ("pandas", pd)], ids=["cudf", "pandas"]) +def test_df_type_str_to_pkg(df_type_str: DataFrameModule, expected: types.ModuleType): + assert df_type_str_to_pkg(df_type_str) is expected + + +@pytest.mark.parametrize("invalid_type_str", ["invalid", "cuDF", "Pandas"]) +def test_df_type_str_to_pkg_invalid(invalid_type_str: typing.Any): + with pytest.raises(ValueError, match="Invalid DataFrame type string"): + df_type_str_to_pkg(invalid_type_str) + + +@pytest.mark.parametrize("df_type_str, expected", [("cudf", ExecutionMode.GPU), ("pandas", ExecutionMode.CPU)], + ids=["cudf", "pandas"]) +def test_df_type_str_to_exec_mode(df_type_str: DataFrameModule, expected: ExecutionMode): + assert df_type_str_to_exec_mode(df_type_str) == expected + + +@pytest.mark.parametrize("invalid_type_str", ["invalid", "cuDF", "Pandas"]) +def test_df_type_str_to_exec_mode_invalid(invalid_type_str: typing.Any): + with pytest.raises(ValueError, match="Invalid DataFrame type string"): + df_type_str_to_exec_mode(invalid_type_str) + + +@pytest.mark.parametrize("exec_mode, expected", [(ExecutionMode.GPU, "cudf"), (ExecutionMode.CPU, "pandas")], + ids=["GPU", "CPU"]) +def test_exec_mode_to_df_type_str(exec_mode: ExecutionMode, expected: DataFrameModule): + assert exec_mode_to_df_type_str(exec_mode) == expected diff --git a/tests/morpheus_dfp/conftest.py b/tests/morpheus_dfp/conftest.py index 4609e4ceee..3bdc85bd39 100644 --- a/tests/morpheus_dfp/conftest.py +++ 
b/tests/morpheus_dfp/conftest.py @@ -60,12 +60,11 @@ def ae_feature_cols_fixture(): @pytest.fixture(name="config") -def config_fixture(config_no_cpp: Config, ae_feature_cols: typing.List[str]): +def config_fixture(config: Config, ae_feature_cols: typing.List[str]): """ The digital_fingerprinting production example utilizes the Auto Encoder config, and requires C++ execution disabled. """ from morpheus.config import ConfigAutoEncoder - config = config_no_cpp config.ae = ConfigAutoEncoder() config.ae.feature_columns = ae_feature_cols yield config @@ -87,25 +86,22 @@ def dfp_prod_in_sys_path( sys.path.append(example_dir) -@pytest.fixture(name="dfp_message_meta") -def dfp_message_meta_fixture(config, dataset_pandas: DatasetManager): - import pandas as pd +@pytest.fixture +def control_message(config, dataset_cudf: DatasetManager): + import cudf - from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta + from morpheus.messages import ControlMessage + from morpheus.messages import MessageMeta user_id = 'test_user' - df = dataset_pandas['filter_probs.csv'] - df[config.ae.timestamp_column_name] = pd.to_datetime([1683054498 + i for i in range(0, len(df) * 30, 30)], unit='s') + df = dataset_cudf['filter_probs.csv'] + timestamps = [1683054498 + i for i in range(0, len(df) * 30, 30)] + df[config.ae.timestamp_column_name] = cudf.to_datetime(timestamps, unit='s') df[config.ae.userid_column_name] = user_id - yield DFPMessageMeta(df, user_id) - -@pytest.fixture -def control_message(dfp_message_meta): - from morpheus.messages import ControlMessage message = ControlMessage() - message.payload(dfp_message_meta) - message.set_metadata("user_id", dfp_message_meta.user_id) + message.payload(MessageMeta(df)) + message.set_metadata("user_id", user_id) message.set_metadata("model", mock.MagicMock()) yield message diff --git a/tests/morpheus_dfp/modules/test_dfp_training.py b/tests/morpheus_dfp/modules/test_dfp_training.py index e4683c1ea2..4408e3bd15 100644 --- 
a/tests/morpheus_dfp/modules/test_dfp_training.py +++ b/tests/morpheus_dfp/modules/test_dfp_training.py @@ -21,6 +21,8 @@ from _utils import TEST_DIRS from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage @@ -50,8 +52,6 @@ def test_on_data(mock_train_test_split: mock.MagicMock, config: Config, dataset_pandas: DatasetManager, validation_size: float): - from morpheus.messages import ControlMessage - from morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.stages.dfp_training import DFPTraining mock_ae.return_value = mock_ae @@ -63,10 +63,9 @@ def test_on_data(mock_train_test_split: mock.MagicMock, mock_validation_df = mock.MagicMock() mock_train_test_split.return_value = (train_df, mock_validation_df) - meta = DFPMessageMeta(df, 'Account-123456789') msg = ControlMessage() - msg.payload(meta) - msg.set_metadata("user_id", meta.user_id) + msg.payload(MessageMeta(df)) + msg.set_metadata("user_id", 'Account-123456789') stage = DFPTraining(config, validation_size=validation_size) results = stage.on_data(msg) diff --git a/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py b/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py index b39e05a03d..39dcfd7d6b 100644 --- a/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py +++ b/tests/morpheus_dfp/stages/test_dfp_mlflow_model_writer.py @@ -27,6 +27,7 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta from morpheus.pipeline.single_port_stage import SinglePortStage MockedRequests = namedtuple("MockedRequests", ["get", "patch", "response"]) @@ -238,7 +239,6 @@ def test_on_data( databricks_env: dict, databricks_permissions: dict, tracking_uri: str): - from 
morpheus_dfp.messages.dfp_message_meta import DFPMessageMeta from morpheus_dfp.stages.dfp_mlflow_model_writer import DFPMLFlowModelWriterStage from morpheus_dfp.stages.dfp_mlflow_model_writer import conda_env @@ -272,11 +272,10 @@ def test_on_data( mock_model.prepare_df.return_value = df mock_model.get_anomaly_score.return_value = pd.Series(float(i) for i in range(len(df))) - meta = DFPMessageMeta(df, 'Account-123456789') msg = ControlMessage() - msg.payload(meta) + msg.payload(MessageMeta(df)) msg.set_metadata("model", mock_model) - msg.set_metadata("user_id", meta.user_id) + msg.set_metadata("user_id", 'Account-123456789') stage = DFPMLFlowModelWriterStage(config, databricks_permissions=databricks_permissions, timeout=10) assert stage._controller.on_data(msg) is msg # Should be a pass-thru diff --git a/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py b/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py index b8f7e8cd18..06d142f91c 100644 --- a/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py +++ b/tests/morpheus_dfp/stages/test_dfp_rolling_window_stage.py @@ -21,9 +21,15 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config +from morpheus.messages import ControlMessage from morpheus.pipeline.single_port_stage import SinglePortStage +@pytest.fixture(name="train_df") +def train_df_fixture(control_message: ControlMessage) -> pd.DataFrame: + return control_message.payload().copy_dataframe().to_pandas() + + def build_mock_user_cache(user_id: str = 'test_user', train_df: pd.DataFrame = None, count: int = 10, @@ -81,109 +87,91 @@ def test_get_user_cache_miss(config: Config): assert results2 is results -def test_build_window_no_new( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_no_new(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, 
min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') mock_cache = build_mock_user_cache() mock_cache.append_dataframe.return_value = False - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_not_enough_data( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_not_enough_data(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') mock_cache = build_mock_user_cache(count=3) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_min_increment( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_min_increment(config: Config, control_message: ControlMessage): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') mock_cache = build_mock_user_cache(count=5, total_count=30, last_train_count=25) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache - assert stage._build_window(dfp_message_meta) is None + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache + assert stage._build_window(control_message) is None -def test_build_window_invalid( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_invalid(config: Config, control_message: 
ControlMessage, train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') - train_df = dfp_message_meta.copy_dataframe() # exact values not important so long as they don't match the actual hash train_df['_row_hash'] = [-1 for _ in range(len(train_df))] mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache with pytest.raises(RuntimeError): - stage._build_window(dfp_message_meta) + stage._build_window(control_message) -def test_build_window_overlap( - config: Config, - dfp_message_meta: "DFPMessageMeta" # noqa: F821 -): +def test_build_window_overlap(config: Config, control_message: ControlMessage, train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') # Create an overlap - train_df = dfp_message_meta.copy_dataframe()[-5:] + train_df = train_df[-5:] train_df['_row_hash'] = pd.util.hash_pandas_object(train_df, index=False) mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache with pytest.raises(RuntimeError): - stage._build_window(dfp_message_meta) + stage._build_window(control_message) @pytest.mark.parametrize('use_on_data', [True, False]) -def test_build_window( - config: Config, - use_on_data: bool, - dfp_message_meta: "DFPMessageMeta", # noqa: F821 - dataset_pandas: DatasetManager): - from morpheus.messages import ControlMessage +def test_build_window(config: Config, + use_on_data: bool, + control_message: ControlMessage, + dataset_pandas: DatasetManager, + 
train_df: pd.DataFrame): from morpheus_dfp.stages.dfp_rolling_window_stage import DFPRollingWindowStage stage = DFPRollingWindowStage(config, min_history=5, min_increment=7, max_history=100, cache_dir='/test/path/cache') # Create an overlap - train_df = dfp_message_meta.copy_dataframe() train_df['_row_hash'] = pd.util.hash_pandas_object(train_df, index=False) mock_cache = build_mock_user_cache(train_df=train_df) - stage._user_cache_map[dfp_message_meta.user_id] = mock_cache + stage._user_cache_map[control_message.get_metadata('user_id')] = mock_cache # on_data is a thin wrapper around _build_window, results should be the same if use_on_data: - msg = stage.on_data(dfp_message_meta) + out_msg = stage.on_data(control_message) else: - msg = stage._build_window(dfp_message_meta) + out_msg = stage._build_window(control_message) - assert isinstance(msg, ControlMessage) - assert msg.get_metadata("user_id") == dfp_message_meta.user_id - assert msg.payload().count == len(dataset_pandas['filter_probs.csv']) - dataset_pandas.assert_df_equal(msg.payload().df, train_df) + assert isinstance(out_msg, ControlMessage) + assert out_msg.get_metadata("user_id") == control_message.get_metadata('user_id') + assert out_msg.payload().count == len(dataset_pandas['filter_probs.csv']) + dataset_pandas.assert_df_equal(out_msg.payload().df, train_df) diff --git a/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py b/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py index 8563fd7f9a..0dca35fd02 100644 --- a/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py +++ b/tests/morpheus_dfp/stages/test_dfp_split_users_stage.py @@ -17,6 +17,7 @@ import os import typing import warnings +from collections import defaultdict import pytest @@ -24,6 +25,7 @@ from _utils.dataset_manager import DatasetManager from morpheus.config import Config from morpheus.pipeline.single_port_stage import SinglePortStage +from morpheus.utils.type_utils import get_df_pkg_from_obj def test_constructor(config: 
Config): @@ -67,14 +69,24 @@ def test_extract_users(config: Config, from morpheus_dfp.stages.dfp_split_users_stage import DFPSplitUsersStage config.ae.userid_column_name = "From" config.ae.fallback_username = "testy_testerson" + ts_col = config.ae.timestamp_column_name input_file = os.path.join(TEST_DIRS.tests_data_dir, "examples/developer_guide/email_with_addresses_first_10.jsonlines") df = dataset[input_file] + df_pkg = get_df_pkg_from_obj(df) + + # When the file is read using pandas (as is the case in the actual DFP pipeline), the timestamp column is + # automatically converted to datetime objects. However cuDF doesn't do this and the column will contain integers. + # When `dataset` is returning pandas DFs this might still be the case if `input_file` is first read using cuDF and + # cached by the DatasetManager and then converted to pandas. + if df[ts_col].dtype == 'int64': + df[ts_col] = df_pkg.to_datetime(df[ts_col], unit='s') all_data = [] - expected_data = {} + expected_data = defaultdict(list) + with open(input_file, encoding='UTF-8') as fh: for line in fh: json_data = json.loads(line) @@ -85,11 +97,13 @@ def test_extract_users(config: Config, if len(only_users) > 0 and user_id not in only_users: continue + json_data[ts_col] = df_pkg.to_datetime(json_data[ts_col], unit='s') + if include_generic: all_data.append(json_data) if include_individual: - expected_data[user_id] = [json_data] + expected_data[user_id].append(json_data) if include_generic: expected_data[config.ae.fallback_username] = all_data @@ -114,9 +128,11 @@ def test_extract_users(config: Config, # Add one for the generic user assert len(results) == len(expected_data) for msg in results: - assert len(msg.df) == len(expected_data[msg.user_id]) - if msg.user_id != config.ae.fallback_username: - assert msg.df.iloc[0].to_dict() == expected_data[msg.user_id][0] + actual_df = msg.payload().df + user_id = msg.get_metadata('user_id') + assert len(actual_df) == len(expected_data[user_id]) + if user_id != 
config.ae.fallback_username: + assert actual_df.to_dict('records') == expected_data[user_id] def test_extract_users_none_to_empty(config: Config): diff --git a/tests/morpheus_llm/llm/nodes/test_extractor_node.py b/tests/morpheus_llm/llm/nodes/test_extractor_node.py index 355f16aeb5..38673c95fc 100644 --- a/tests/morpheus_llm/llm/nodes/test_extractor_node.py +++ b/tests/morpheus_llm/llm/nodes/test_extractor_node.py @@ -39,6 +39,7 @@ def test_execute(): df = cudf.DataFrame({"insects": insects.copy(), "mammals": mammals.copy(), "reptiles": reptiles.copy()}) message = ControlMessage() message.payload(MessageMeta(df)) + message.set_metadata("llm_message_meta", message.payload()) task_dict = {"input_keys": ["mammals", "reptiles"]} node = ExtracterNode() diff --git a/tests/morpheus_llm/llm/nodes/test_manual_extractor_node.py b/tests/morpheus_llm/llm/nodes/test_manual_extractor_node.py index 143636999d..7c3ef3542e 100644 --- a/tests/morpheus_llm/llm/nodes/test_manual_extractor_node.py +++ b/tests/morpheus_llm/llm/nodes/test_manual_extractor_node.py @@ -48,6 +48,7 @@ def test_execute(): df = cudf.DataFrame({"insects": insects.copy(), "mammals": mammals.copy(), "reptiles": reptiles.copy()}) message = ControlMessage() message.payload(MessageMeta(df)) + message.set_metadata("llm_message_meta", message.payload()) task_dict = {"input_keys": ["insects"]} node = ManualExtracterNode(["mammals", "reptiles"]) diff --git a/tests/morpheus_llm/llm/task_handlers/test_simple_task_handler.py b/tests/morpheus_llm/llm/task_handlers/test_simple_task_handler.py index 8439d2df3d..641f65b9b4 100644 --- a/tests/morpheus_llm/llm/task_handlers/test_simple_task_handler.py +++ b/tests/morpheus_llm/llm/task_handlers/test_simple_task_handler.py @@ -46,6 +46,7 @@ def test_try_handle(dataset_cudf: DatasetManager): message = ControlMessage() message.payload(MessageMeta(df)) + message.set_metadata("llm_message_meta", message.payload()) task_handler = SimpleTaskHandler(['reptiles']) diff --git 
a/tests/morpheus_llm/llm/test_vdb_upload_pipe.py b/tests/morpheus_llm/llm/test_vdb_upload_pipe.py index 88a51c631f..85394da57c 100755 --- a/tests/morpheus_llm/llm/test_vdb_upload_pipe.py +++ b/tests/morpheus_llm/llm/test_vdb_upload_pipe.py @@ -31,7 +31,6 @@ @pytest.mark.milvus -@pytest.mark.use_python @pytest.mark.use_pandas @pytest.mark.import_mod([ os.path.join(TEST_DIRS.examples_dir, 'llm/common'), diff --git a/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py b/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py index 4d3935091b..7cb381f15c 100644 --- a/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py +++ b/tests/morpheus_llm/stages/test_llm_engine_stage_pipe.py @@ -16,8 +16,6 @@ import os -import pytest - from _utils import TEST_DIRS from _utils import assert_results from _utils.dataset_manager import DatasetManager @@ -39,8 +37,6 @@ def _build_engine() -> LLMEngine: return engine -@pytest.mark.use_cudf -@pytest.mark.use_python def test_pipeline(config: Config, dataset_cudf: DatasetManager): test_data = os.path.join(TEST_DIRS.validation_data_dir, 'root-cause-validation-data-input.jsonlines') input_df = dataset_cudf[test_data] diff --git a/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py b/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py index 20c8bf243a..b39aa1920d 100755 --- a/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py +++ b/tests/morpheus_llm/stages/test_milvus_write_to_vector_db_stage_pipe.py @@ -45,7 +45,7 @@ def get_test_df(num_input_rows): @pytest.mark.milvus -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.parametrize("use_instance, num_input_rows, expected_num_output_rows, resource_kwargs, recreate", [(True, 5, 5, { "partition_name": "age_partition" diff --git a/tests/test_conftest.py b/tests/test_conftest.py index cc37b918d3..5856152771 100644 --- a/tests/test_conftest.py +++ b/tests/test_conftest.py @@ -21,18 +21,40 @@ import cudf from 
_utils.dataset_manager import DatasetManager +from morpheus.config import Config from morpheus.config import CppConfig +from morpheus.config import ExecutionMode +from morpheus.utils.type_aliases import DataFrameModule +from morpheus.utils.type_utils import exec_mode_to_df_type_str -@pytest.fixture(name="cpp_from_marker", scope="function") -def cpp_from_marker_fixture(request: pytest.FixtureRequest) -> bool: +def exec_mode_to_cpp_mode(exec_mode: ExecutionMode) -> bool: + return exec_mode == ExecutionMode.GPU - use_cpp = len([x for x in request.node.iter_markers("use_cpp") if "added_by" in x.kwargs]) > 0 - use_python = len([x for x in request.node.iter_markers("use_python") if "added_by" in x.kwargs]) > 0 - assert use_cpp != use_python +@pytest.fixture(name="exec_mode_from_marker", scope="function") +def exec_mode_from_marker_fixture(request: pytest.FixtureRequest) -> ExecutionMode: - return use_cpp + gpu_mode = len([x for x in request.node.iter_markers("gpu_mode") if "added_by" in x.kwargs]) > 0 + cpu_mode = len([x for x in request.node.iter_markers("cpu_mode") if "added_by" in x.kwargs]) > 0 + + assert gpu_mode != cpu_mode + + if gpu_mode: + return ExecutionMode.GPU + + return ExecutionMode.CPU + + +@pytest.fixture(name="cpp_mode_from_marker", scope="function") +def cpp_mode_from_marker_fixture(request: pytest.FixtureRequest) -> bool: + + gpu_mode = len([x for x in request.node.iter_markers("gpu_mode") if "added_by" in x.kwargs]) > 0 + cpu_mode = len([x for x in request.node.iter_markers("cpu_mode") if "added_by" in x.kwargs]) > 0 + + assert gpu_mode != cpu_mode + + return gpu_mode @pytest.fixture(name="df_type_from_marker", scope="function") @@ -117,78 +139,60 @@ def test_no_mark(): # === No Marks === -@pytest.mark.use_cpp -def test_mark_use_cpp(): +@pytest.mark.gpu_mode +def test_mark_gpu_mode(): assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -def test_mark_use_python(): +@pytest.mark.cpu_mode +def test_mark_cpu_mode(): assert not 
CppConfig.get_should_use_cpp() -@pytest.mark.use_cpp -@pytest.mark.use_python -def test_mark_both(cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - - # === Marks and Config === -@pytest.mark.use_cpp +@pytest.mark.gpu_mode @pytest.mark.usefixtures("config") -def test_mark_and_config_use_cpp(): +def test_mark_and_config_gpu_mode(): assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -@pytest.mark.usefixtures("config") -def test_mark_and_config_use_python(): +@pytest.mark.cpu_mode +def test_mark_and_config_cpu_mode(config: Config): assert not CppConfig.get_should_use_cpp() + assert config.execution_mode == ExecutionMode.CPU -@pytest.mark.use_cpp -@pytest.mark.use_python -@pytest.mark.usefixtures("config") -def test_mark_and_config_both(cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker +@pytest.mark.gpu_and_cpu_mode +def test_gpu_and_cpu_mode(config: Config, exec_mode_from_marker: ExecutionMode): + assert config.execution_mode == exec_mode_from_marker -@pytest.mark.usefixtures("config") -def test_mark_and_config_neither(cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker +def test_mark_and_config_neither(config: Config): + assert CppConfig.get_should_use_cpp() + assert config.execution_mode == ExecutionMode.GPU # === Fixture === -@pytest.mark.use_cpp -def test_fixture_use_cpp(use_cpp: bool): - assert use_cpp +@pytest.mark.gpu_mode +def test_fixture_gpu_mode(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python -def test_fixture_use_python(use_cpp: bool): - assert not use_cpp +@pytest.mark.cpu_mode +def test_fixture_cpu_mode(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() -@pytest.mark.use_cpp -@pytest.mark.use_python -def test_fixture_both(use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp - 
- -def test_fixture_neither(use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp +def test_fixture_neither(execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU + assert CppConfig.get_should_use_cpp() # === Config Fixture === -@pytest.mark.usefixtures("config_no_cpp") -def test_config_fixture_no_cpp(): - assert not CppConfig.get_should_use_cpp() - - -@pytest.mark.usefixtures("config_only_cpp") -def test_config_fixture_only_cpp(): +@pytest.mark.usefixtures("config") +def test_config_fixture(): assert CppConfig.get_should_use_cpp() @@ -197,67 +201,62 @@ class TestNoMarkerClass: def test_no_marker(self): assert CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_python_marker(self): + @pytest.mark.cpu_mode + def test_python_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - def test_cpp_marker(self): + @pytest.mark.gpu_mode + def test_cpp_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - @pytest.mark.use_python - def test_marker_both(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - @pytest.mark.slow - def test_other_marker(self, use_cpp: bool): - assert CppConfig.get_should_use_cpp() == use_cpp + def test_other_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU + assert CppConfig.get_should_use_cpp() -@pytest.mark.use_python +@pytest.mark.cpu_mode class TestPythonMarkerClass: def test_no_marker(self): assert not CppConfig.get_should_use_cpp() - def test_with_fixture(self, use_cpp: bool): - assert not use_cpp + def test_with_fixture(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_extra_marker(self): + 
@pytest.mark.cpu_mode + def test_extra_marker(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.CPU assert not CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp - def test_add_marker(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - -@pytest.mark.use_cpp +@pytest.mark.gpu_mode class TestCppMarkerClass: def test_no_marker(self): assert CppConfig.get_should_use_cpp() - def test_with_fixture(self, use_cpp: bool): - assert use_cpp + def test_with_fixture(self, execution_mode: ExecutionMode): + assert execution_mode == ExecutionMode.GPU assert CppConfig.get_should_use_cpp() - @pytest.mark.use_cpp + @pytest.mark.gpu_mode def test_extra_marker(self): assert CppConfig.get_should_use_cpp() - @pytest.mark.use_python - def test_add_marker(self, cpp_from_marker: bool): - assert CppConfig.get_should_use_cpp() == cpp_from_marker - # === DF Type === def test_df_type_no_marks(df_type, df_type_from_marker): assert df_type == df_type_from_marker +def test_df_type_matches_execution_mode(df_type: DataFrameModule, execution_mode: ExecutionMode): + assert df_type == exec_mode_to_df_type_str(execution_mode) + + @pytest.mark.use_pandas def test_df_type_pandas_marker(df_type): assert df_type == "pandas" From bd246d5dba814a2f97fd1ef55862a4e240c2604a Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 22 Oct 2024 12:09:12 -0400 Subject: [PATCH 6/7] Remove TrainAEStage issue from known issues --- docs/source/extra_info/known_issues.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/extra_info/known_issues.md b/docs/source/extra_info/known_issues.md index 7feb1dca62..b009da5ce9 100644 --- a/docs/source/extra_info/known_issues.md +++ b/docs/source/extra_info/known_issues.md @@ -17,7 +17,6 @@ limitations under the License. 
# Known Issues -- TrainAEStage fails with a Segmentation fault ([#1641](https://github.com/nv-morpheus/Morpheus/issues/1641)) - `vdb_upload` example pipeline triggers an internal error in Triton ([#1649](https://github.com/nv-morpheus/Morpheus/issues/1649)) Refer to [open issues in the Morpheus project](https://github.com/nv-morpheus/Morpheus/issues) From 54a18365da9e62e1a6455f3160367d7dba4afee3 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 22 Oct 2024 13:08:57 -0400 Subject: [PATCH 7/7] Add back starter dfp source stages and remove PipelineModes.AE --- docs/source/basics/overview.rst | 2 +- python/morpheus/morpheus/config.py | 1 - .../stages/input/autoencoder_source_stage.py | 341 ++++++++++++++++++ .../stages/input/azure_source_stage.py | 177 +++++++++ .../stages/input/cloud_trail_source_stage.py | 184 ++++++++++ .../control_message_kafka_source_stage.py | 3 - .../morpheus/stages/input/duo_source_stage.py | 172 +++++++++ .../stages/postprocess/timeseries_stage.py | 3 - tests/morpheus/test_config.py | 2 +- 9 files changed, 876 insertions(+), 9 deletions(-) create mode 100644 python/morpheus/morpheus/stages/input/autoencoder_source_stage.py create mode 100644 python/morpheus/morpheus/stages/input/azure_source_stage.py create mode 100644 python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py create mode 100644 python/morpheus/morpheus/stages/input/duo_source_stage.py diff --git a/docs/source/basics/overview.rst b/docs/source/basics/overview.rst index 61aef2e8b6..a93ff35fda 100644 --- a/docs/source/basics/overview.rst +++ b/docs/source/basics/overview.rst @@ -27,7 +27,7 @@ The Morpheus CLI is built on the Click Python package which allows for nested co together. At a high level, the CLI is broken up into two main sections: * ``run`` - * For running AE, FIL, NLP or OTHER pipelines. + * For running FIL, NLP or OTHER pipelines. * ``tools`` * Tools/Utilities to help set up, configure and run pipelines and external resources. 
diff --git a/python/morpheus/morpheus/config.py b/python/morpheus/morpheus/config.py index 2b0073103e..2bc589a186 100644 --- a/python/morpheus/morpheus/config.py +++ b/python/morpheus/morpheus/config.py @@ -137,7 +137,6 @@ class PipelineModes(str, Enum): OTHER = "OTHER" NLP = "NLP" FIL = "FIL" - AE = "AE" class ExecutionMode(str, Enum): diff --git a/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py b/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py new file mode 100644 index 0000000000..7174650893 --- /dev/null +++ b/python/morpheus/morpheus/stages/input/autoencoder_source_stage.py @@ -0,0 +1,341 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from abc import abstractmethod +from functools import partial + +import mrc +import pandas as pd +from mrc.core import operators as ops + +from morpheus.common import FileTypes +from morpheus.config import Config +from morpheus.messages import ControlMessage +from morpheus.messages import MessageMeta +from morpheus.pipeline.execution_mode_mixins import GpuAndCpuMixin +from morpheus.pipeline.preallocator_mixin import PreallocatorMixin +from morpheus.pipeline.single_output_source import SingleOutputSource +from morpheus.pipeline.stage_schema import StageSchema +from morpheus.utils.directory_watcher import DirectoryWatcher + + +class AutoencoderSourceStage(PreallocatorMixin, GpuAndCpuMixin, SingleOutputSource): + """ + All AutoEncoder source stages must extend this class and implement the `files_to_dfs_per_user` abstract method. + Feature columns can be managed by overriding the `derive_features` method. Otherwise, all columns from input + data pass through to next stage. + + Extend this class to load messages from a files and dump contents into a DFP pipeline immediately. Useful for + testing performance and accuracy of a pipeline. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + input_glob : str + Input glob pattern to match files to read. For example, `./input_dir/*.json` would read all files with the + 'json' extension in the directory input_dir. + watch_directory : bool, default = False + The watch directory option instructs this stage to not close down once all files have been read. Instead it will + read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional + files. Any new files that are added that match the glob will then be processed. + max_files: int, default = -1 + Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. + file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. 
+ Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. + Supported extensions: 'json', 'csv' + repeat: int, default = 1 + How many times to repeat the dataset. Useful for extending small datasets in debugging. + sort_glob : bool, default = False + If true the list of files matching `input_glob` will be processed in sorted order. + recursive: bool, default = True + If true, events will be emitted for the files in subdirectories that match `input_glob`. + queue_max_size: int, default = 128 + Maximum queue size to hold the file paths to be processed that match `input_glob`. + batch_timeout: float, default = 5.0 + Timeout to retrieve batch messages from the queue. + """ + + def __init__(self, + c: Config, + input_glob: str, + watch_directory: bool = False, + max_files: int = -1, + file_type: FileTypes = FileTypes.Auto, + repeat: int = 1, + sort_glob: bool = False, + recursive: bool = True, + queue_max_size: int = 128, + batch_timeout: float = 5.0): + + SingleOutputSource.__init__(self, c) + + self._input_glob = input_glob + self._file_type = file_type + + self._feature_columns = c.ae.feature_columns + self._user_column_name = c.ae.userid_column_name + self._userid_filter = c.ae.userid_filter + + self._input_count = None + + # Hold the max index we have seen to ensure sequential and increasing indexes + self._rows_per_user: dict[str, int] = {} + + # Iterative mode will emit dataframes one at a time. Otherwise a list of dataframes is emitted. Iterative mode + # is good for interleaving source stages. 
+ self._repeat_count = repeat + + self._df_class = self.get_df_class() + + self._watcher = DirectoryWatcher(input_glob=input_glob, + watch_directory=watch_directory, + max_files=max_files, + sort_glob=sort_glob, + recursive=recursive, + queue_max_size=queue_max_size, + batch_timeout=batch_timeout, + should_stop_fn=self.is_stop_requested) + + @property + def input_count(self) -> int: + """Return None for no max input count""" + return self._input_count if self._input_count is not None else 0 + + def compute_schema(self, schema: StageSchema): + schema.output_schema.set_type(ControlMessage) + + def get_match_pattern(self, glob_split): + """Return a file match pattern""" + dir_to_watch = os.path.dirname(glob_split[0]) + match_pattern = self._input_glob.replace(dir_to_watch + "/", "", 1) + + return match_pattern + + @staticmethod + def repeat_df(df: pd.DataFrame, repeat_count: int) -> list[pd.DataFrame]: + """ + This function iterates over the same dataframe to extending small datasets in debugging with incremental + updates to the `event_dt` and `eventTime` columns. + + Parameters + ---------- + df : pd.DataFrame + To be repeated dataframe. + repeat_count : int + Number of times the given dataframe should be repeated. + + Returns + ------- + df_array : list[pd.DataFrame] + List of repeated dataframes. + """ + + df_array = [] + + df_array.append(df) + + for _ in range(1, repeat_count): + x = df.copy() + + # Now increment the timestamps by the interval in the df + x["event_dt"] = x["event_dt"] + (x["event_dt"].iloc[-1] - x["event_dt"].iloc[0]) + x["eventTime"] = x["event_dt"].dt.strftime("%Y-%m-%dT%H:%M:%SZ") + + df_array.append(x) + + # Set df for next iteration + df = x + + return df_array + + @staticmethod + def batch_user_split(x: list[pd.DataFrame], + userid_column_name: str, + userid_filter: str, + datetime_column_name="event_dt"): + """ + Creates a dataframe for each userid. + + Parameters + ---------- + x : list[pd.DataFrame] + List of dataframes. 
+ userid_column_name : str + Name of a dataframe column used for categorization. + userid_filter : str + Only rows with the supplied userid are filtered. + datetime_column_name : str + Name of the dataframe column used to sort the rows. + + Returns + ------- + user_dfs : dict[str, pd.DataFrame] + Dataframes, each of which is associated with a single userid. + """ + + combined_df = pd.concat(x) + + if (datetime_column_name in combined_df): + + # Convert to date_time column + # combined_df["event_dt"] = pd.to_datetime(combined_df["eventTime"]) + + # Set the index name so we can sort first by time then by index (to keep things all in order). Then restore + # the name + saved_index_name = combined_df.index.name + + combined_df.index.name = "idx" + + # Sort by time + combined_df = combined_df.sort_values(by=[datetime_column_name, "idx"]) + + combined_df.index.name = saved_index_name + + # Get the users in this DF + unique_users = combined_df[userid_column_name].unique() + + user_dfs = {} + + for user_name in unique_users: + + if (userid_filter is not None and user_name != userid_filter): + continue + + # Get just this users data and make a copy to remove link to grouped DF + user_df = combined_df[combined_df[userid_column_name] == user_name].copy() + + user_dfs[user_name] = user_df + + return user_dfs + + @staticmethod + @abstractmethod + def files_to_dfs_per_user(x: list[str], + userid_column_name: str, + feature_columns: list[str], + userid_filter: str = None, + repeat_count: int = 1) -> dict[str, pd.DataFrame]: + """ + Stages that extend `AutoencoderSourceStage` must implement this abstract function + in order to convert messages in the files to dataframes per userid. + + Parameters + ---------- + x : list[str] + List of messages. + userid_column_name : str + Name of the column used for categorization. + feature_columns : list[str] + Feature column names. + userid_filter : str + Only rows with the supplied userid are filtered. 
+ repeat_count : str + Number of times the given rows should be repeated. + + Returns + ------- + : dict[str, pd.DataFrame] + Dataframe per userid. + """ + + pass + + @staticmethod + def derive_features(df: pd.DataFrame, feature_columns: list[str] | None): # pylint: disable=unused-argument + """ + If any features are available to be derived, can be implemented by overriding this function. + + Parameters + ---------- + df : pd.DataFrame + A dataframe. + feature_columns : list[str] + Names of columns that are need to be derived. + + Returns + ------- + df : list[pd.DataFrame] + Dataframe with actual and derived columns. + """ + return df + + def _add_derived_features(self, user_dataframes: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]: + + for user_name in user_dataframes.keys(): + user_dataframes[user_name] = self.derive_features(user_dataframes[user_name], None) + + return user_dataframes + + def _build_message(self, user_dataframes: dict[str, pd.DataFrame]) -> list[ControlMessage]: + + messages = [] + + for user_name, user_df in user_dataframes.items(): + + # See if we have seen this user before + if (user_name not in self._rows_per_user): + self._rows_per_user[user_name] = 0 + + # Combine the original index with itself so it shows up as a named column + user_df.index.name = "_index_" + (user_df.index.name or "") + user_df = user_df.reset_index() + + # Now ensure the index for this user is correct + user_df.index = range(self._rows_per_user[user_name], self._rows_per_user[user_name] + len(user_df)) + self._rows_per_user[user_name] += len(user_df) + + # If we're in GPU mode we need to convert to cuDF + if not isinstance(user_df, self._df_class): + for col in [col for col in user_df.columns if isinstance(user_df[col].dtype, pd.DatetimeTZDtype)]: + user_df[col] = user_df[col].dt.tz_convert(None) + + user_df = self._df_class(user_df) + + # Now make a message with the user name in metadata + meta = MessageMeta(user_df) + message = ControlMessage() + 
message.payload(meta) + message.set_metadata("user_id", user_name) + + messages.append(message) + + return messages + + def _build_source(self, builder: mrc.Builder) -> mrc.SegmentObject: + # The first source just produces filenames + return self._watcher.build_node(self.unique_name, builder) + + def _post_build_single(self, builder: mrc.Builder, out_node: mrc.SegmentObject) -> mrc.SegmentObject: + + # At this point, we have batches of filenames to process. Make a node for processing batches of + # filenames into batches of dataframes + post_node = builder.make_node( + self.unique_name + "-post", + ops.map( + partial( + self.files_to_dfs_per_user, + userid_column_name=self._user_column_name, + feature_columns=None, # Use None here to leave all columns in + userid_filter=self._userid_filter, + repeat_count=self._repeat_count)), + ops.map(self._add_derived_features), + # Now group the batch of dataframes into a single df, split by user, and send a single ControlMessage + # per user + ops.map(self._build_message), + ops.flatten()) + builder.make_edge(out_node, post_node) + + return super()._post_build_single(builder, post_node) diff --git a/python/morpheus/morpheus/stages/input/azure_source_stage.py b/python/morpheus/morpheus/stages/input/azure_source_stage.py new file mode 100644 index 0000000000..ccab3f3cc2 --- /dev/null +++ b/python/morpheus/morpheus/stages/input/azure_source_stage.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import typing + +import pandas as pd + +from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage + +logger = logging.getLogger(__name__) + + +class AzureSourceStage(AutoencoderSourceStage): + """ + Source stage is used to load Azure Active Directory messages. + + Adds the following derived features: + - `appincrement`: Increments every time the logs contain a distinct app. + - `locincrement`: Increments every time a log contains a distinct city within a day. + - `logcount`: Tracks the number of logs generated by a user within a day. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + input_glob : str + Input glob pattern to match files to read. For example, `./input_dir/*.json` would read all files with the + 'json' extension in the directory input_dir. + watch_directory : bool, default = False + The watch directory option instructs this stage to not close down once all files have been read. Instead it will + read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional + files. Any new files that are added that match the glob will then be processed. + max_files: int, default = -1 + Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. + file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. + Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. + Supported extensions: 'json', 'csv' + repeat: int, default = 1 + How many times to repeat the dataset. Useful for extending small datasets in debugging. + sort_glob : bool, default = False + If true the list of files matching `input_glob` will be processed in sorted order. 
+ recursive: bool, default = True + If true, events will be emitted for the files in subdirectories that match `input_glob`. + queue_max_size: int, default = 128 + Maximum queue size to hold the file paths to be processed that match `input_glob`. + batch_timeout: float, default = 5.0 + Timeout to retrieve batch messages from the queue. + """ + + @property + def name(self) -> str: + return "from-azure" + + def supports_cpp_node(self): + return False + + @staticmethod + def change_columns(df): + """ + Removes characters (_,.,{,},:) from the names of the dataframe columns. + + Parameters + ---------- + df : `pd.DataFrame` + Dataframe that requires column renaming. + + Returns + ------- + df : `pd.DataFrame` + Dataframe with renamed columns. + """ + + df.columns = df.columns.str.replace('[_,.,{,},:]', '') + df.columns = df.columns.str.strip() + return df + + @staticmethod + def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): + """ + Derives feature columns from the AzureAD (logs) source columns. + + Parameters + ---------- + df : pd.DataFrame + Dataframe for deriving columns. + feature_columns : typing.List[str] + Names of columns that are need to be derived. + + Returns + ------- + df : typing.List[pd.DataFrame] + Dataframe with actual and derived columns. 
+ """ + + default_date = '1970-01-01T00:00:00.000000+00:00' + timestamp_column = "createdDateTime" + city_column = "locationcity" + state_column = "locationstate" + country_column = "locationcountryOrRegion" + application_column = "appDisplayName" + + df = AzureSourceStage.change_columns(df) + df['time'] = pd.to_datetime(df[timestamp_column], errors='coerce') + df['day'] = df['time'].dt.date + df.fillna({'time': pd.to_datetime(default_date), 'day': pd.to_datetime(default_date).date()}, inplace=True) + df.sort_values(by=['time'], inplace=True) + + overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] + overall_location_df = df[overall_location_columns].fillna('nan') + df['overall_location'] = overall_location_df.apply(lambda x: ', '.join(x), axis=1) + df['loc_cat'] = df.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + df.fillna({'loc_cat': 1}, inplace=True) + df['locincrement'] = df.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + df.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + + df['app_cat'] = df.groupby('day')[application_column].transform(lambda x: pd.factorize(x)[0] + 1) + df.fillna({'app_cat': 1}, inplace=True) + df['appincrement'] = df.groupby('day')['app_cat'].expanding(1).max().droplevel(0) + df.drop('app_cat', inplace=True, axis=1) + + df["logcount"] = df.groupby('day').cumcount() + + if (feature_columns is not None): + df.drop(columns=df.columns.difference(feature_columns), inplace=True) + + return df + + @staticmethod + def files_to_dfs_per_user(x: typing.List[str], + userid_column_name: str, + feature_columns: typing.List[str], + userid_filter: str = None, + repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: + """ + After loading the input batch of AzureAD logs into a dataframe, this method builds a dataframe + for each set of userid rows in accordance with the specified filter condition. 
+ + Parameters + ---------- + x : typing.List[str] + List of messages. + userid_column_name : str + Name of the column used for categorization. + feature_columns : typing.List[str] + Feature column names. + userid_filter : str + Only rows with the supplied userid are filtered. + repeat_count : str + Number of times the given rows should be repeated. + + Returns + ------- + df_per_user : typing.Dict[str, pd.DataFrame] + Dataframe per userid. + """ + + dfs = [] + for file in x: + df = pd.read_json(file, orient="records") + df = pd.json_normalize(df['properties']) + dfs = dfs + AutoencoderSourceStage.repeat_df(df, repeat_count) + + df_per_user = AutoencoderSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) + + return df_per_user diff --git a/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py b/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py new file mode 100644 index 0000000000..ec78097a25 --- /dev/null +++ b/python/morpheus/morpheus/stages/input/cloud_trail_source_stage.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import typing + +import numpy as np +import pandas as pd + +from morpheus.common import FileTypes +from morpheus.common import determine_file_type +from morpheus.io.deserializers import read_file_to_df +from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage + +logger = logging.getLogger(__name__) + + +class CloudTrailSourceStage(AutoencoderSourceStage): + """ + Load messages from a CloudTrail directory. + + """ + + @property + def name(self) -> str: + return "from-cloudtrail" + + @property + def input_count(self) -> int: + """Return None for no max intput count""" + return self._input_count + + def supports_cpp_node(self): + return False + + def get_match_pattern(self, glob_split): + """Return a file match pattern""" + dir_to_watch = os.path.dirname(glob_split[0]) + match_pattern = self._input_glob.replace(dir_to_watch + "/", "", 1) + + return match_pattern + + @staticmethod + def read_file(filename: str, file_type: FileTypes) -> pd.DataFrame: + """ + Reads a file into a dataframe. + + Parameters + ---------- + filename : str + Path to a file to read. + file_type : `morpheus.common.FileTypes` + What type of file to read. Leave as Auto to auto detect based on the file extension. + + Returns + ------- + pandas.DataFrame + The parsed dataframe. + + Raises + ------ + RuntimeError + If an unsupported file type is detected. 
+ """ + + df = read_file_to_df(filename, file_type, df_type="pandas") + + # If reading the file only produced one line and we are a JSON file, try loading structured file + if (determine_file_type(filename) == FileTypes.JSON and len(df) == 1 and list(df) == ["Records"]): + + # Reread with lines=False + df = read_file_to_df(filename, file_type, df_type="pandas", parser_kwargs={"lines": False}) + + # Normalize + df = pd.json_normalize(df['Records']) + + return df + + @staticmethod + def cleanup_df(df: pd.DataFrame, feature_columns: typing.List[str]): + """ + This function does clean up certain columns in the dataframe. + + Parameters + ---------- + df : pd.DataFrame + Dataframe for columns cleanup. + feature_columns : typing.List[str] + Only the columns that are present in the feature columns will be preserved in the dataframe + if feature columns are supplied.. + + Returns + ------- + df : typing.List[pd.DataFrame] + Clean dataframe. + """ + + # Replace all the dots in column names + df.columns = df.columns.str.replace('.', '', regex=False) + + df["event_dt"] = pd.to_datetime(df["eventTime"]) + + def remove_null(x): + + if isinstance(x, list): + if isinstance(x[0], dict): + key = list(x[0].keys()) + return x[0][key[0]] + return x + + def clean_column(cloudtrail_df): + + col_name = 'requestParametersownersSetitems' + if (col_name in cloudtrail_df): + cloudtrail_df[col_name] = cloudtrail_df[col_name].apply(lambda x: remove_null(x)) + return cloudtrail_df + + # Drop any unneeded columns if specified + if (feature_columns is not None): + df.drop(columns=df.columns.difference(feature_columns), inplace=True) + + # Reorder columns to be the same + # df = df[pd.Index(feature_columns).intersection(df.columns)] + + # Convert a numerical account ID into a string + if ("userIdentityaccountId" in df and df["userIdentityaccountId"].dtype != np.dtype('O')): + df['userIdentityaccountId'] = 'Account-' + df['userIdentityaccountId'].astype(str) + + df = clean_column(df) + + return df 
+ + @staticmethod + def files_to_dfs_per_user(x: typing.List[str], + userid_column_name: str, + feature_columns: typing.List[str], + userid_filter: str = None, + repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: + """ + After loading the input batch of CloudTrail logs into a dataframe, this method builds a dataframe + for each set of userid rows in accordance with the specified filter condition. + + Parameters + ---------- + x : typing.List[str] + List of messages. + userid_column_name : str + Name of the column used for categorization. + feature_columns : typing.List[str] + Feature column names. + userid_filter : str + Only rows with the supplied userid are filtered. + repeat_count : str + Number of times the given rows should be repeated. + + Returns + ------- + df_per_user : typing.Dict[str, pd.DataFrame] + Dataframe per userid. + """ + + # Using pandas to parse nested JSON until cuDF adds support + # https://github.com/rapidsai/cudf/issues/8827 + dfs = [] + for file in x: + df = CloudTrailSourceStage.read_file(file, FileTypes.Auto) + df = CloudTrailSourceStage.cleanup_df(df, feature_columns) + dfs = dfs + CloudTrailSourceStage.repeat_df(df, repeat_count) + + df_per_user = CloudTrailSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) + + return df_per_user diff --git a/python/morpheus/morpheus/stages/input/control_message_kafka_source_stage.py b/python/morpheus/morpheus/stages/input/control_message_kafka_source_stage.py index 6e7967c1d5..7f74bbb5a1 100644 --- a/python/morpheus/morpheus/stages/input/control_message_kafka_source_stage.py +++ b/python/morpheus/morpheus/stages/input/control_message_kafka_source_stage.py @@ -21,9 +21,7 @@ import mrc import pandas as pd -from morpheus.cli.register_stage import register_stage from morpheus.config import Config -from morpheus.config import PipelineModes from morpheus.messages import ControlMessage from morpheus.pipeline.preallocator_mixin import PreallocatorMixin from 
morpheus.pipeline.single_output_source import SingleOutputSource @@ -33,7 +31,6 @@ logger = logging.getLogger(__name__) -@register_stage("from-cm-kafka", modes=[PipelineModes.AE]) class ControlMessageKafkaSourceStage(PreallocatorMixin, SingleOutputSource): """ Load control messages from a Kafka cluster. diff --git a/python/morpheus/morpheus/stages/input/duo_source_stage.py b/python/morpheus/morpheus/stages/input/duo_source_stage.py new file mode 100644 index 0000000000..c4645cce0e --- /dev/null +++ b/python/morpheus/morpheus/stages/input/duo_source_stage.py @@ -0,0 +1,172 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Sourse stage for Duo Authentication logs.""" + +import json +import logging +import typing + +import pandas as pd + +from morpheus.stages.input.autoencoder_source_stage import AutoencoderSourceStage + +DEFAULT_DATE = '1970-01-01T00:00:00.000000+00:00' +logger = logging.getLogger(__name__) + + +class DuoSourceStage(AutoencoderSourceStage): + """ + Source stage is used to load Duo Authentication messages. + + Adds the following derived features: + - `locincrement`: Increments every time a log contains a distinct city within a day. + - `logcount`: Tracks the number of logs generated by a user within a day. + + Parameters + ---------- + c : `morpheus.config.Config` + Pipeline configuration instance. + input_glob : str + Input glob pattern to match files to read. 
For example, `./input_dir/*.json` would read all files with the + 'json' extension in the directory input_dir. + watch_directory : bool, default = False + The watch directory option instructs this stage to not close down once all files have been read. Instead it will + read all files that match the 'input_glob' pattern, and then continue to watch the directory for additional + files. Any new files that are added that match the glob will then be processed. + max_files: int, default = -1 + Max number of files to read. Useful for debugging to limit startup time. Default value of -1 is unlimited. + file_type : `morpheus.common.FileTypes`, default = 'FileTypes.Auto'. + Indicates what type of file to read. Specifying 'auto' will determine the file type from the extension. + Supported extensions: 'json', 'csv' + repeat: int, default = 1 + How many times to repeat the dataset. Useful for extending small datasets in debugging. + sort_glob : bool, default = False + If true the list of files matching `input_glob` will be processed in sorted order. + recursive: bool, default = True + If true, events will be emitted for the files in subdirectories that match `input_glob`. + queue_max_size: int, default = 128 + Maximum queue size to hold the file paths to be processed that match `input_glob`. + batch_timeout: float, default = 5.0 + Timeout to retrieve batch messages from the queue. + """ + + @property + def name(self) -> str: + """Unique name for the stage.""" + return "from-duo" + + def supports_cpp_node(self): + """Indicate that this stages does not support a C++ node.""" + return False + + @staticmethod + def change_columns(df): + """ + Removes characters (_,.,{,},:) from the names of the dataframe columns. + + Parameters + ---------- + df : `pd.DataFrame` + Dataframe that requires column renaming. + + Returns + ------- + df : `pd.DataFrame` + Dataframe with renamed columns. 
+ """ + df.columns = df.columns.str.replace('[_,.,{,},:]', '') + df.columns = df.columns.str.strip() + return df + + @staticmethod + def derive_features(df: pd.DataFrame, feature_columns: typing.List[str]): + """ + Derives feature columns from the DUO (logs) source columns. + + Parameters + ---------- + df : pd.DataFrame + Dataframe for deriving columns. + feature_columns : typing.List[str] + Names of columns that are need to be derived. + + Returns + ------- + df : typing.List[pd.DataFrame] + Dataframe with actual and derived columns. + """ + timestamp_column = "isotimestamp" + city_column = "accessdevicelocationcity" + state_column = "accessdevicelocationstate" + country_column = "accessdevicelocationcountry" + + df['time'] = pd.to_datetime(df[timestamp_column], errors='coerce') + df['day'] = df['time'].dt.date + df.fillna({'time': pd.to_datetime(DEFAULT_DATE), 'day': pd.to_datetime(DEFAULT_DATE).date()}, inplace=True) + df.sort_values(by=['time'], inplace=True) + + overall_location_columns = [col for col in [city_column, state_column, country_column] if col is not None] + overall_location_df = df[overall_location_columns].fillna('nan') + df['overall_location'] = overall_location_df.apply(lambda x: ', '.join(x), axis=1) + df['loc_cat'] = df.groupby('day')['overall_location'].transform(lambda x: pd.factorize(x)[0] + 1) + df.fillna({'loc_cat': 1}, inplace=True) + df['locincrement'] = df.groupby('day')['loc_cat'].expanding(1).max().droplevel(0) + df.drop(['overall_location', 'loc_cat'], inplace=True, axis=1) + + df["logcount"] = df.groupby('day').cumcount() + + if (feature_columns is not None): + df.drop(columns=df.columns.difference(feature_columns), inplace=True) + + return df + + @staticmethod + def files_to_dfs_per_user(x: typing.List[str], + userid_column_name: str, + feature_columns: typing.List[str], + userid_filter: str = None, + repeat_count: int = 1) -> typing.Dict[str, pd.DataFrame]: + """ + After loading the input batch of DUO logs into a dataframe, this 
method builds a dataframe + for each set of userid rows in accordance with the specified filter condition. + + Parameters + ---------- + x : typing.List[str] + List of messages. + userid_column_name : str + Name of the column used for categorization. + feature_columns : typing.List[str] + Feature column names. + userid_filter : str + Only rows with the supplied userid are filtered. + repeat_count : str + Number of times the given rows should be repeated. + + Returns + ------- + df_per_user : typing.Dict[str, pd.DataFrame] + Dataframe per userid. + """ + dfs = [] + for file in x: + with open(file, encoding='UTF-8') as json_in: + log = json.load(json_in) + df = pd.json_normalize(log) + df = DuoSourceStage.change_columns(df) + dfs = dfs + AutoencoderSourceStage.repeat_df(df, repeat_count) + + df_per_user = AutoencoderSourceStage.batch_user_split(dfs, userid_column_name, userid_filter) + + return df_per_user diff --git a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py index 5d7e5d5a67..cdda8d5b12 100644 --- a/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py +++ b/python/morpheus/morpheus/stages/postprocess/timeseries_stage.py @@ -25,9 +25,7 @@ import pandas as pd from mrc.core import operators as ops -from morpheus.cli.register_stage import register_stage from morpheus.config import Config -from morpheus.config import PipelineModes from morpheus.messages import ControlMessage from morpheus.pipeline.pass_thru_type_mixin import PassThruTypeMixin from morpheus.pipeline.single_port_stage import SinglePortStage @@ -403,7 +401,6 @@ def _calc_timeseries(self, x: ControlMessage, is_complete: bool): return output_messages -@register_stage("timeseries", modes=[PipelineModes.AE]) class TimeSeriesStage(PassThruTypeMixin, SinglePortStage): """ Perform time series anomaly detection and add prediction. 
diff --git a/tests/morpheus/test_config.py b/tests/morpheus/test_config.py index f958ce2f8b..746acf3771 100755 --- a/tests/morpheus/test_config.py +++ b/tests/morpheus/test_config.py @@ -88,7 +88,7 @@ def test_auto_encoder(): def test_pipeline_modes(): - expected = {"OTHER", "NLP", "FIL", "AE"} + expected = {"OTHER", "NLP", "FIL"} entries = set(pm.name for pm in morpheus.config.PipelineModes) assert entries.issuperset(expected)