Update docker ml (demisto#35081)
* updated docker

* added the rest

* devdemisto/ml:1.0.0.100486

* fix tpb

* return on no incidents

* remove runonce

* remove space

* fixed

* fix create incidents script

* new docker

* revert: fix create incidents script

* add outputs to DBotFindSimilarIncidents

* new tpb DBotFindSimilarIncidents-test

* new docker

* bump transformers

* Empty-Commit

* fix conf.json

* more fixes

* more fixes

* new docker

* RN

* new docker

* revert dockers

* more stuff

* redirect stderr

* docker

* format

* format

* RN

* more stuff

* build fixes

* build fixes

* fix unit-tests

* more docker changes

* more docker changes

* build fixes

* suppress logger

* build fixes

* build fixes
jlevypaloalto authored Jul 11, 2024
1 parent e3c3b60 commit 83ba6ce
Showing 40 changed files with 1,144 additions and 917 deletions.
24 changes: 24 additions & 0 deletions Packs/Base/ReleaseNotes/1_34_28.md
@@ -0,0 +1,24 @@

#### Scripts

##### DBotTrainTextClassifierV2

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotBuildPhishingClassifier

- Changed the Docker image to: *demisto/python3:3.11.9.101916*.
##### DBotPreProcessTextData

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotPredictPhishingWords

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotFindSimilarIncidents

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### GetMLModelEvaluation

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
##### DBotFindSimilarIncidentsByIndicators

- Updated the Docker image to: *demisto/ml:1.0.0.101889*.
@@ -1,19 +1,12 @@
from CommonServerPython import *
import base64
import copy
import gc

from CommonServerPython import *

PREFIXES_TO_REMOVE = ['incident.']
ALL_LABELS = "*"


def preprocess_incidents_field(incidents_field):
incidents_field = incidents_field.strip()
for prefix in PREFIXES_TO_REMOVE:
if incidents_field.startswith(prefix):
incidents_field = incidents_field[len(prefix):]
return incidents_field
return incidents_field.strip().removeprefix('incident.')


def get_phishing_map_labels(comma_values):
@@ -28,7 +21,7 @@ def get_phishing_map_labels(comma_values):
labels_dict[splited[0].strip()] = splited[1].strip()
else:
labels_dict[v] = v
return {k: v for k, v in labels_dict.items()}
return dict(labels_dict.items())


def build_query_in_reepect_to_phishing_labels(args):
@@ -38,17 +31,17 @@ def build_query_in_reepect_to_phishing_labels(args):
return args
mapping_dict = get_phishing_map_labels(mapping)
tag_field = args['tagField']
tags_union = ' '.join(['"{}"'.format(label) for label in mapping_dict])
mapping_query = '{}:({})'.format(tag_field, tags_union)
tags_union = ' '.join([f'"{label}"' for label in mapping_dict])
mapping_query = f'{tag_field}:({tags_union})'
if 'query' not in args or args['query'].strip() == '':
args['query'] = mapping_query
else:
args['query'] = '({}) and ({})'.format(query, mapping_query)
args['query'] = f'({query}) and ({mapping_query})'
return args


def get_incidents(d_args):
get_incidents_by_query_args = copy.deepcopy(d_args)
get_incidents_by_query_args = d_args.copy()
get_incidents_by_query_args['NonEmptyFields'] = d_args['tagField']
fields_names_to_populate = ['tagField', 'emailsubject', 'emailbody', "emailbodyhtml"]
fields_to_populate = [get_incidents_by_query_args.get(x, None) for x in fields_names_to_populate]
@@ -63,15 +56,15 @@ def get_incidents(d_args):


def preprocess_incidents(incidents, d_args):
text_pre_process_args = copy.deepcopy(d_args)
text_pre_process_args = d_args.copy()
text_pre_process_args['inputType'] = 'json_b64_string'
text_pre_process_args['input'] = base64.b64encode(incidents.encode('utf-8')).decode('ascii')
text_pre_process_args['preProcessType'] = 'nlp'
email_body_fields = [text_pre_process_args.get("emailbody"), text_pre_process_args.get("emailbodyhtml")]
email_body = "|".join([x for x in email_body_fields if x])
text_pre_process_args['textFields'] = "%s,%s" % (text_pre_process_args['emailsubject'], email_body)
text_pre_process_args['whitelistFields'] = "{0},{1}".format('dbot_processed_text',
text_pre_process_args['tagField'])
text_pre_process_args['textFields'] = "{},{}".format(text_pre_process_args['emailsubject'], email_body)
text_pre_process_args['whitelistFields'] = "{},{}".format('dbot_processed_text',
text_pre_process_args['tagField'])
res = demisto.executeCommand("DBotPreProcessTextData", text_pre_process_args)
if is_error(res):
return_error(get_error(res))
@@ -81,7 +74,7 @@ def preprocess_incidents(incidents, d_args):


def train_model(processed_text_data, d_args):
train_model_args = copy.deepcopy(d_args)
train_model_args = d_args.copy()
train_model_args['inputType'] = 'json_b64_string'
train_model_args['input'] = base64.b64encode(processed_text_data.encode('utf-8')).decode('ascii')
train_model_args['overrideExistingModel'] = 'true'
@@ -90,7 +83,7 @@ def train_model(processed_text_data, d_args):


def main():
d_args = dict(demisto.args())
d_args = demisto.args()
for arg in ['tagField', 'emailbody', 'emailbodyhtml', 'emailsubject', 'timeField']:
d_args[arg] = preprocess_incidents_field(d_args.get(arg, ''))

@@ -4,9 +4,9 @@ args:
- defaultValue: Phishing
description: A comma-separated list of incident types by which to filter.
name: incidentTypes
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
- description: 'The start date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200").'
name: fromDate
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200")'
- description: 'The end date by which to filter incidents. Date format will be the same as in the incidents query page (valid strings example: "3 days ago", ""2019-01-01T00:00:00 +0200").'
name: toDate
- defaultValue: '3000'
description: The maximum number of incidents to fetch.
@@ -39,7 +39,7 @@ args:
- description: The model name to store in the system.
name: modelName
- defaultValue: '*'
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communitcation, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communitcation" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communitcation:non-malicious, external legit email:non-malicious'
description: 'A comma-separated list of email tags values and mapping. The script considers only the tags specified in this field. You can map a label to another value by using this format: LABEL:MAPPED_LABEL. For example, for 4 values in email tag: malicious, credentials harvesting, inner communitcation, external legit email, unclassified. While training, we want to ignore "unclassified" tag, and refer to "credentials harvesting" as "malicious" too. Also, we want to merge "inner communitcation" and "external legit email" to one tag called "non-malicious". The input will be: malicious, credentials harvesting:malicious, inner communitcation:non-malicious, external legit email:non-malicious.'
name: phishingLabels
- defaultValue: emailsubject
description: Incident field name with the email subject.
@@ -83,8 +83,7 @@ tags:
- ml
timeout: 12µs
type: python
dockerimage: demisto/ml:1.0.0.45981
runonce: true
dockerimage: demisto/python3:3.11.9.101916
tests:
- Create Phishing Classifier V2 ML Test
- DBotCreatePhishingClassifierV2FromFile-Test
@@ -13,7 +13,8 @@ def test_no_mapping_no_query():
def test_no_mapping_with_query():
args = {'phishingLabels': '*', 'query': QUERY}
args = build_query_in_reepect_to_phishing_labels(args)
assert 'query' in args and args['query'] == QUERY
assert 'query' in args
assert args['query'] == QUERY


def test_mapping_no_query():
@@ -27,6 +28,6 @@ def test_mapping_with_query():
args = {'phishingLabels': MAPPING, 'tagField': 'closeReason', 'query': QUERY}
args = build_query_in_reepect_to_phishing_labels(args)
assert 'query' in args
opt1 = args['query'] == '({}) and (closeReason:("spam" "legit"))'.format(QUERY)
opt2 = args['query'] == '({}) and (closeReason:("legit" "spam"))'.format(QUERY)
opt1 = args['query'] == f'({QUERY}) and (closeReason:("spam" "legit"))'
opt2 = args['query'] == f'({QUERY}) and (closeReason:("legit" "spam"))'
assert opt1 or opt2
@@ -86,9 +86,27 @@ script: '-'
subtype: python3
timeout: '0'
type: python
dockerimage: demisto/ml:1.0.0.94241
dockerimage: demisto/ml:1.0.0.101889
runas: DBotWeakRole
runonce: true
tests:
- No tests (auto formatted)
- DBotFindSimilarIncidents-test
fromversion: 5.0.0
outputs:
- contextPath: DBotFindSimilarIncidents.isSimilarIncidentFound
description: Indicates whether similar incidents have been found.
type: boolean
- contextPath: DBotFindSimilarIncidents.similarIncident.created
description: The creation date of the linked incident.
type: date
- contextPath: DBotFindSimilarIncidents.similarIncident.id
description: The ID of the linked incident.
type: string
- contextPath: DBotFindSimilarIncidents.similarIncident.name
description: The name of the linked incident.
type: string
- contextPath: DBotFindSimilarIncidents.similarIncident.similarity incident
description: The similarity of the linked incident represented as a float in the range 0-1.
type: number
- contextPath: DBotFindSimilarIncidents.similarIncident.details
description: The details of the linked incident.
type: string
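
As an editorial aside, not part of the commit: the outputs declared above describe the shape of the context data this script now sets. The sketch below is a minimal, invented illustration of a context entry that would match those contextPaths; the sample values, and the assumption that `similarIncident` is a list of objects, are not taken from the diff.

```python
# Hypothetical example only: a context entry shaped like the outputs declared above.
# All values are invented; treating "similarIncident" as a list is an assumption.
example_context = {
    "DBotFindSimilarIncidents": {
        "isSimilarIncidentFound": True,               # whether similar incidents were found
        "similarIncident": [
            {
                "id": "1234",                          # ID of the linked incident
                "name": "Suspected phishing email",    # name of the linked incident
                "created": "2024-07-01T09:30:00Z",     # creation date of the linked incident
                "similarity incident": 0.87,           # similarity score in the range 0-1
                "details": "Example details of the linked incident",
            }
        ],
    }
}
```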
@@ -42,7 +42,7 @@ script: '-'
subtype: python3
timeout: '0'
type: python
dockerimage: demisto/ml:1.0.0.88591
dockerimage: demisto/ml:1.0.0.101889
runas: DBotWeakRole
tests:
- DBotFindSimilarIncidentsByIndicators - Test
@@ -1,9 +1,12 @@
# pylint: disable=no-member

from CommonServerPython import *
from string import punctuation
import demisto_ml
import numpy as np
import logging

# Suppress logging for a specific library
logging.getLogger('transformers').setLevel(logging.ERROR)

FASTTEXT_MODEL_TYPE = 'FASTTEXT_MODEL_TYPE'
TORCH_TYPE = 'torch'
@@ -14,27 +17,30 @@ def OrderedSet(iterable):
return list(dict.fromkeys(iterable))


def get_model_data(model_name, store_type, is_return_error):
res_model_list = demisto.executeCommand("getList", {"listName": model_name})[0]
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})[0]
if is_error(res_model_list) and not is_error(res_model):
model_data = res_model['Contents']['modelData']
try:
model_type = res_model['Contents']['model']["type"]["type"]
return model_data, model_type
except Exception:
return model_data, UNKNOWN_MODEL_TYPE
elif not is_error(res_model_list) and is_error(res_model):
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
elif not is_error(res_model_list) and not is_error(res_model):
if store_type == "list":
return res_model_list["Contents"], UNKNOWN_MODEL_TYPE
elif store_type == "mlModel":
model_data = res_model['Contents']['modelData']
model_type = res_model['Contents']['model']["type"]["type"]
return model_data, model_type
else:
handle_error("error reading model %s from Demisto" % model_name, is_return_error)
def get_model_data(model_name: str, store_type: str, is_return_error: bool) -> tuple[dict, str]:

def load_from_models(model_name: str) -> None | tuple[dict, str]:
res_model = demisto.executeCommand("getMLModel", {"modelName": model_name})
if is_error(res_model):
demisto.debug(get_error(res_model))
return None
model_data = res_model[0]['Contents']['modelData']
model_type = dict_safe_get(res_model, [0, 'Contents', 'model', "type", "type"], UNKNOWN_MODEL_TYPE)
return model_data, model_type

def load_from_list(model_name):
res_model = demisto.executeCommand("getList", {"listName": model_name})
if is_error(res_model):
demisto.debug(get_error(res_model))
return None
return res_model[0]["Contents"], UNKNOWN_MODEL_TYPE

if store_type == "mlModel":
res = load_from_models(model_name) or load_from_list(model_name)
elif store_type == "list":
res = load_from_list(model_name) or load_from_models(model_name)

return res or handle_error(f"error reading model {model_name} from Demisto", is_return_error) # type: ignore


def handle_error(message, is_return_error):
@@ -88,6 +94,7 @@ def preprocess_text(text, model_type, is_return_error):
else:
words_to_token_maps = tokenized_text_result['originalWordsToTokens']
return input_text, words_to_token_maps
return None


def predict_phishing_words(model_name, model_store_type, email_subject, email_body, min_text_length, label_threshold,
@@ -97,7 +104,9 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo
model_type = FASTTEXT_MODEL_TYPE
if model_type not in [FASTTEXT_MODEL_TYPE, TORCH_TYPE, UNKNOWN_MODEL_TYPE]:
model_type = UNKNOWN_MODEL_TYPE

phishing_model = demisto_ml.phishing_model_loads_handler(model_data, model_type)

is_model_applied_on_a_single_incidents = isinstance(email_subject, str) and isinstance(email_body, str)
if is_model_applied_on_a_single_incidents:
return predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold,
@@ -110,7 +119,7 @@ def predict_phishing_words(model_name, model_store_type, email_subject, email_bo


def predict_batch_incidents_light_output(email_subject, email_body, phishing_model, model_type, min_text_length):
text_list = [{'text': "%s \n%s" % (subject, body)} for subject, body in zip(email_subject, email_body)]
text_list = [{'text': f"{subject} \n{body}"} for subject, body in zip(email_subject, email_body)]
preprocessed_text_list = preprocess_text(text_list, model_type, is_return_error=False)
batch_predictions = []
for input_text in preprocessed_text_list:
@@ -132,14 +141,14 @@ def predict_batch_incidents_light_output(email_subject, email_body, phishing_mod
'Type': entryTypes['note'],
'Contents': batch_predictions,
'ContentsFormat': formats['json'],
'HumanReadable': 'Applied predictions on {} incidents.'.format(len(batch_predictions)),
'HumanReadable': f'Applied predictions on {len(batch_predictions)} incidents.',
}


def predict_single_incident_full_output(email_subject, email_body, is_return_error, label_threshold, min_text_length,
model_type, phishing_model, set_incidents_fields, top_word_limit,
word_threshold):
text = "%s \n%s" % (email_subject, email_body)
text = f"{email_subject} \n{email_body}"
input_text, words_to_token_maps = preprocess_text(text, model_type, is_return_error)
filtered_text, filtered_text_number_of_words = phishing_model.filter_model_words(input_text)
if filtered_text_number_of_words == 0:
@@ -163,22 +172,22 @@ def predict_single_incident_full_output(email_subject, email_body, is_return_err
negative_tokens = OrderedSet(explain_result['NegativeWords'])
positive_words = find_words_contain_tokens(positive_tokens, words_to_token_maps)
negative_words = find_words_contain_tokens(negative_tokens, words_to_token_maps)
positive_words = list(OrderedSet([s.strip(punctuation) for s in positive_words]))
negative_words = list(OrderedSet([s.strip(punctuation) for s in negative_words]))
positive_words = OrderedSet([s.strip(punctuation) for s in positive_words])
negative_words = OrderedSet([s.strip(punctuation) for s in negative_words])
positive_words = [w for w in positive_words if w.isalnum()]
negative_words = [w for w in negative_words if w.isalnum()]
highlighted_text_markdown = text.strip()
for word in positive_words:
for cased_word in [word.lower(), word.title(), word.upper()]:
highlighted_text_markdown = re.sub(r'(?<!\w)({})(?!\w)'.format(cased_word), '**{}**'.format(cased_word),
highlighted_text_markdown = re.sub(fr'(?<!\w)({cased_word})(?!\w)', f'**{cased_word}**',
highlighted_text_markdown)
highlighted_text_markdown = re.sub(r'\n+', '\n', highlighted_text_markdown)
explain_result['PositiveWords'] = [w.lower() for w in positive_words]
explain_result['NegativeWords'] = [w.lower() for w in negative_words]
explain_result['OriginalText'] = text.strip()
explain_result['TextTokensHighlighted'] = highlighted_text_markdown
predicted_label = explain_result["Label"]
explain_result_hr = dict()
explain_result_hr = {}
explain_result_hr['TextTokensHighlighted'] = highlighted_text_markdown
explain_result_hr['Label'] = predicted_label
explain_result_hr['Probability'] = "%.2f" % predicted_prob
@@ -98,8 +98,7 @@ tags:
- phishing
timeout: 60µs
type: python
dockerimage: demisto/ml:1.0.0.32340
runonce: true
dockerimage: demisto/ml:1.0.0.101889
tests:
- Create Phishing Classifier V2 ML Test
fromversion: 5.0.0