Update docker ml (demisto#35081)
* updated docker

* added the rest

* devdemisto/ml:1.0.0.100486

* fix tpb

* return on no incidents

* remove runonce

* remove space

* fixed

* fix create incidents script

* new docker

* revert: fix create incidents script

* add outputs to DBotFindSimilarIncidents

* new tpb DBotFindSimilarIncidents-test

* new docker

* bump transformers

* Empty-Commit

* fix conf.json

* more fixes

* more fixes

* new docker

* RN

* new docker

* revert dockers

* more stuff

* redirect stderr

* docker

* format

* format

* RN

* more stuff

* build fixes

* build fixes

* fix unit-tests

* more docker changes

* more docker changes

* build fixes

* suppress logger

* build fixes

* build fixes
jlevypaloalto authored Jul 11, 2024
1 parent e3c3b60 commit 83ba6ce
Showing 40 changed files with 1,144 additions and 917 deletions.
24 changes: 24 additions & 0 deletions Packs/Base/ReleaseNotes/1_34_28.md
@@ -0,0 +1,24 @@

#### Scripts

##### DBotTrainTextClassifierV2

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotBuildPhishingClassifier

- Changed the Docker image to: *demisto/python3:3.11.9.101916*.
##### DBotPreProcessTextData

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotPredictPhishingWords

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotFindSimilarIncidents

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### GetMLModelEvaluation

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotFindSimilarIncidentsByIndicators

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
@@ -1,19 +1,12 @@
from CommonServerPython import *
import base64
import copy
import gc

from CommonServerPython import *

PREFIXES_TO_REMOVE = ['incident.']
ALL_LABELS = "*"


def preprocess_incidents_field(incidents_field):
incidents_field = incidents_field.strip()
for prefix in PREFIXES_TO_REMOVE:
if incidents_field.startswith(prefix):
incidents_field = incidents_field[len(prefix):]
return incidents_field
return incidents_field.strip().removeprefix('incident.')


def get_phishing_map_labels(comma_values):
@@ -28,7 +21,7 @@ def get_phishing_map_labels(comma_values):
labels_dict[splited[0].strip()] = splited[1].strip()
else:
labels_dict[v] = v
return {k: v for k, v in labels_dict.items()}
return dict(labels_dict.items())


def build_query_in_reepect_to_phishing_labels(args):
@@ -38,17 +31,17 @@ def build_query_in_reepect_to_phishing_labels(args):
return args
mapping_dict = get_phishing_map_labels(mapping)
tag_field = args['tagField']
tags_union = ' '.join(['"{}"'.format(label) for label in mapping_dict])
mapping_query = '{}:({})'.format(tag_field, tags_union)
tags_union = ' '.join([f'"{label}"' for label in mapping_dict])
mapping_query = f'{tag_field}:({tags_union})'
if 'query' not in args or args['query'].strip() == '':
args['query'] = mapping_query
else:
args['query'] = '({}) and ({})'.format(query, mapping_query)
args['query'] = f'({query}) and ({mapping_query})'
return args


def get_incidents(d_args):
get_incidents_by_query_args = copy.deepcopy(d_args)
get_incidents_by_query_args = d_args.copy()
get_incidents_by_query_args['NonEmptyFields'] = d_args['tagField']
fields_names_to_populate = ['tagField', 'emailsubject', 'emailbody', "emailbodyhtml"]
fields_to_populate = [get_incidents_by_query_args.get(x, None) for x in fields_names_to_populate]
@@ -63,15 +56,15 @@ def get_incidents(d_args):


def preprocess_incidents(incidents, d_args):
text_pre_process_args = copy.deepcopy(d_args)
text_pre_process_args = d_args.copy()
text_pre_process_args['inputType'] = 'json_b64_string'
text_pre_process_args['input'] = base64.b64encode(incidents.encode('utf-8')).decode('ascii')
text_pre_process_args['preProcessType'] = 'nlp'
email_body_fields = [text_pre_process_args.get("emailbody"), text_pre_process_args.get("emailbodyhtml")]
email_body = "|".join([x for x in email_body_fields if x])
text_pre_process_args['textFields'] = "%s,%s" % (text_pre_process_args['emailsubject'], email_body)
text_pre_process_args['whitelistFields'] = "{0},{1}".format('dbot_processed_text',
text_pre_process_args['tagField'])
text_pre_process_args['textFields'] = "{},{}".format(text_pre_process_args['emailsubject'], email_body)
text_pre_process_args['whitelistFields'] = "{},{}".format('dbot_processed_text',
text_pre_process_args['tagField'])
res = demisto.executeCommand("DBotPreProcessTextData", text_pre_process_args)
if is_error(res):
return_error(get_error(res))
@@ -81,7 +74,7 @@ def preprocess_incidents(incidents, d_args):


def train_model(processed_text_data, d_args):
train_model_args = copy.deepcopy(d_args)
train_model_args = d_args.copy()
train_model_args['inputType'] = 'json_b64_string'
train_model_args['input'] = base64.b64encode(processed_text_data.encode('utf-8')).decode('ascii')
train_model_args['overrideExistingModel'] = 'true'
@@ -90,7 +83,7 @@ def train_model(processed_text_data, d_args):


def main():
d_args = dict(demisto.args())
d_args = demisto.args()
for arg in ['tagField', 'emailbody', 'emailbodyhtml', 'emailsubject', 'timeField']:
d_args[arg] = preprocess_incidents_field(d_args.get(arg, ''))

@@ -4,9 +4,9 @@ args:
- defaultValue: Phishing
description: A comma-separated list of incident types by which to filter.
name: incidentTypes
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200").'
name: fromDate
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200").'
name: toDate
- defaultValue: '3000'
description: The maximum number of incidents to fetch.
@@ -39,7 +39,7 @@ args:
- description: The model name to store in the system.
name: modelName
- defaultValue: '*'
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communitcation, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communitcation" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communitcation:non-malicious, external legit email:non-malicious'
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communitcation, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communitcation" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communitcation:non-malicious, external legit email:non-malicious.'
name: phishingLabels
- defaultValue: emailsubject
description: Incident field name with the email subject.
@@ -83,8 +83,7 @@ tags:
- ml
timeout: 12µs
type: python
dockerimage: demisto/ml:1.0.0.45981
runonce: true
dockerimage: demisto/python3:3.11.9.101916
tests:
- Create Phishing Classifier V2 ML Test
- DBotCreatePhishingClassifierV2FromFile-Test
@@ -13,7 +13,8 @@ def test_no_mapping_no_query():
def test_no_mapping_with_query():
args = {'phishingLabels': '*', 'query': QUERY}
args = build_query_in_reepect_to_phishing_labels(args)
assert 'query' in args and args['query'] == QUERY
assert 'query' in args
assert args['query'] == QUERY


def test_mapping_no_query():
@@ -27,6 +28,6 @@ def test_mapping_with_query():
args = {'phishingLabels': MAPPING, 'tagField': 'closeReason', 'query': QUERY}
args = build_query_in_reepect_to_phishing_labels(args)
assert 'query' in args
opt1 = args['query'] == '({}) and (closeReason:("spam" "legit"))'.format(QUERY)
opt2 = args['query'] == '({}) and (closeReason:("legit" "spam"))'.format(QUERY)
opt1 = args['query'] == f'({QUERY}) and (closeReason:("spam" "legit"))'
opt2 = args['query'] == f'({QUERY}) and (closeReason:("legit" "spam"))'
assert opt1 or opt2
@@ -86,9 +86,27 @@ script: '-'
subtype: python3
timeout: '0'
type: python
dockerimage: demisto/ml:1.0.0.94241
dockerimage: demisto/ml:1.0.0.101889
runas: DBotWeakRole
runonce: true
tests:
- No tests (auto formatted)
- DBotFindSimilarIncidents-test
fromversion: 5.0.0
outputs:
- contextPath: DBotFindSimilarIncidents.isSimilarIncidentFound
description: Indicates whether similar incidents have been found.
type: boolean
- contextPath: DBotFindSimilarIncidents.similarIncident.created
description: The creation date of the linked incident.
type: date
- contextPath: DBotFindSimilarIncidents.similarIncident.id
description: The ID of the linked incident.
type: string
- contextPath: DBotFindSimilarIncidents.similarIncident.name
description: The name of the linked incident.
type: string
- contextPath: DBotFindSimilarIncidents.similarIncident.similarity incident
description: The similarity of the linked incident represented as a float in the range 0-1.
type: number
- contextPath: DBotFindSimilarIncidents.similarIncident.details
description: The details of the linked incident.
type: string
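
As an editorial aside, not part of the commit: the outputs declared above describe the shape of the context data this script now sets. The sketch below is a minimal, invented illustration of a context entry that would match those contextPaths; the sample values, and the assumption that `similarIncident` is a list of objects, are not taken from the diff.

```python
# Hypothetical example only: a context entry shaped like the outputs declared above.
# All values are invented; treating "similarIncident" as a list is an assumption.
example_context = {
    "DBotFindSimilarIncidents": {
        "isSimilarIncidentFound": True,               # whether similar incidents were found
        "similarIncident": [
            {
                "id": "1234",                          # ID of the linked incident
                "name": "Suspected phishing email",    # name of the linked incident
                "created": "2024-07-01T09:30:00Z",     # creation date of the linked incident
                "similarity incident": 0.87,           # similarity score in the range 0-1
                "details": "Example details of the linked incident",
            }
        ],
    }
}
```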
@@ -42,7 +42,7 @@ script: '-'
subtype: python3
timeout: '0'
type: python
dockerimage: demisto/ml:1.0.0.88591
dockerimage: demisto/ml:1.0.0.101889
runas: DBotWeakRole
tests:
- DBotFindSimilarIncidentsByIndicators - Test
@@ -1,9 +1,12 @@
# pylint: disable=no-member

from CommonServerPython import *
from string import punctuation
import demisto_ml
import numpy as np
import logging

# Suppress logging for a specific library
logging.getLogger('transformers').setLevel(logging.ERROR)

FASTTEXT_MODEL_TYPE = 'FASTTEXT_MODEL_TYPE'
TORCH_TYPE = 'torch'
@@ -14,27 +17,30 @@ def OrderedSet(iterable):
return list(dict.fromkeys(iterable))


def get_model_data(model_name, store_type, is_return_error):
res_model_list = demisto.executeCommand("getList", {"listName": model_name})[0]
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})[0]
if is_error(res_model_list) and not is_error(res_model):
model_data = res_model['Contents']['modelData']
try:
model_type = res_model['Contents']['model']["type"]["type"]
return model_data, model_type
except Exception:
return model_data, UNKNOWN_MODEL_TYPE
elif not is_error(res_model_list) and is_error(res_model):
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
elif not is_error(res_model_list) and not is_error(res_model):
if store_type == "list":
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
elif store_type == "mlModel":
model_data = res_model['Contents']['modelData']
model_type = res_model['Contents']['model']["type"]["type"]
return model_data, model_type
else:
handle_error("error reading model %s from Demisto" % model_name, is_return_error)
def get_model_data(model_name: str, store_type: str, is_return_error: bool) -> tuple[dict, str]:

def load_from_models(model_name: str) -> None | tuple[dict, str]:
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})
if is_error(res_model):
demisto.debug(get_error(res_model))
return None
model_data = res_model[0]['Contents']['modelData']
model_type = dict_safe_get(res_model, [0, 'Contents', 'model', "type", "type"], UNKNOWN_MODEL_TYPE)
return model_data, model_type

def load_from_list(model_name):
res_model = demisto.executeCommand("getList", {"listName": model_name})
if is_error(res_model):
demisto.debug(get_error(res_model))
return None
return res_model[0]["Contents"], UNKNOWN_MODEL_TYPE

if store_type == "mlModel":
res = load_from_models(model_name) or load_from_list(model_name)
elif store_type == "list":
res = load_from_list(model_name) or load_from_models(model_name)

return res or handle_error(f"error reading model {model_name} from Demisto", is_return_error) # type: ignore


def handle_error(message, is_return_error):
@@ -88,6 +94,7 @@ def preprocess_text(text, model_type, is_return_error):
else:
words_to_token_maps = tokenized_text_result['originalWordsToTokens']
return input_text, words_to_token_maps
return None


def predict_phishing_words(model_name, model_store_type, email_subject, email_body, min_text_length, label_threshold,
@@ -97,7 +104,9 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo
model_type = FASTTEXT_MODEL_TYPE
if model_type not in [FASTTEXT_MODEL_TYPE, TORCH_TYPE, UNKNOWN_MODEL_TYPE]:
model_type = UNKNOWN_MODEL_TYPE

phishing_model = demisto_ml.phishing_model_loads_handler(model_data, model_type)

is_model_applied_on_a_single_incidents = isinstance(email_subject, str) and isinstance(email_body, str)
if is_model_applied_on_a_single_incidents:
return predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold,
@@ -110,7 +119,7 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo


def predict_batch_incidents_light_output(email_subject, email_body, phishing_model, model_type, min_text_length):
text_list = [{'text': "%s \n%s" % (subject, body)} for subject, body in zip(email_subject, email_body)]
text_list = [{'text': f"{subject} \n{body}"} for subject, body in zip(email_subject, email_body)]
preprocessed_text_list = preprocess_text(text_list, model_type, is_return_error=False)
batch_predictions = []
for input_text in preprocessed_text_list:
@@ -132,14 +141,14 @@ def predict_batch_incidents_light_output(email_subject, email_body, phishing_mod
'Type': entryTypes['note'],
'Contents': batch_predictions,
'ContentsFormat': formats['json'],
'HumanReadable': 'Applied predictions on {} incidents.'.format(len(batch_predictions)),
'HumanReadable': f'Applied predictions on {len(batch_predictions)} incidents.',
}


def predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold, min_text_length,
model_type, phishing_model, set_incidents_fields, top_word_limit,
word_threshold):
text = "%s \n%s" % (email_subject, email_body)
text = f"{email_subject} \n{email_body}"
input_text, words_to_token_maps = preprocess_text(text, model_type, is_return_error)
filtered_text, filtered_text_number_of_words = phishing_model.filter_model_words(input_text)
if filtered_text_number_of_words == 0:
@@ -163,22 +172,22 @@ def predict_single_incident_full_output(email_subject, email_body, is_return_err
negative_tokens = OrderedSet(explain_result['NegativeWords'])
positive_words = find_words_contain_tokens(positive_tokens, words_to_token_maps)
negative_words = find_words_contain_tokens(negative_tokens, words_to_token_maps)
positive_words = list(OrderedSet([s.strip(punctuation) for s in positive_words]))
negative_words = list(OrderedSet([s.strip(punctuation) for s in negative_words]))
positive_words = OrderedSet([s.strip(punctuation) for s in positive_words])
negative_words = OrderedSet([s.strip(punctuation) for s in negative_words])
positive_words = [w for w in positive_words if w.isalnum()]
negative_words = [w for w in negative_words if w.isalnum()]
highlighted_text_markdown = text.strip()
for word in positive_words:
for cased_word in [word.lower(), word.title(), word.upper()]:
highlighted_text_markdown = re.sub(r'(?<!\w)({})(?!\w)'.format(cased_word), '**{}**'.format(cased_word),
highlighted_text_markdown = re.sub(fr'(?<!\w)({cased_word})(?!\w)', f'**{cased_word}**',
highlighted_text_markdown)
highlighted_text_markdown = re.sub(r'\n+', '\n', highlighted_text_markdown)
explain_result['PositiveWords'] = [w.lower() for w in positive_words]
explain_result['NegativeWords'] = [w.lower() for w in negative_words]
explain_result['OriginalText'] = text.strip()
explain_result['TextTokensHighlighted'] = highlighted_text_markdown
predicted_label = explain_result["Label"]
explain_result_hr = dict()
explain_result_hr = {}
explain_result_hr['TextTokensHighlighted'] = highlighted_text_markdown
explain_result_hr['Label'] = predicted_label
explain_result_hr['Probability'] = "%.2f" % predicted_prob
@@ -98,8 +98,7 @@ tags:
- phishing
timeout: 60µs
type: python
dockerimage: demisto/ml:1.0.0.32340
runonce: true
dockerimage: demisto/ml:1.0.0.101889
tests:
- Create Phishing Classifier V2 ML Test
fromversion: 5.0.0