-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from center-for-threat-informed-defense/matt/initial-model
Matt/initial model
- Loading branch information
Showing
11 changed files
with
730,644 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,5 @@ docs/_build/ | |
.mypy_cache/ | ||
*.tmp | ||
TODO* | ||
.DS_Store | ||
.pre-commit-config.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6942,4 +6942,4 @@ | |
"origin_of_data": "adversary emulation plans" | ||
} | ||
] | ||
} | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"From /opt/homebrew/anaconda3/envs/tie/lib/python3.11/site-packages/tensorflow/python/compat/v2_compat.py:108: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n", | ||
"Instructions for updating:\n", | ||
"non-resource variables are not supported in the long term\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Imports\n", | ||
"import json\n", | ||
"from mitreattack.stix20 import MitreAttackData\n", | ||
"import tensorflow as tf\n", | ||
"from recommender import FactorizationRecommender\n", | ||
"import random\n", | ||
"import math" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_mitre_technique_ids(stix_filepath: str) -> frozenset[str]:\n", | ||
" \"\"\"Gets all MITRE technique ids.\"\"\"\n", | ||
" mitre_attack_data = MitreAttackData(stix_filepath)\n", | ||
" techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)\n", | ||
"\n", | ||
" all_technique_ids = set()\n", | ||
"\n", | ||
" for technique in techniques:\n", | ||
" external_references = technique.get(\"external_references\")\n", | ||
" mitre_references = tuple(filter(lambda external_reference: external_reference.get(\"source_name\") == \"mitre-attack\", external_references))\n", | ||
" assert len(mitre_references) == 1\n", | ||
" mitre_technique_id = mitre_references[0][\"external_id\"]\n", | ||
" all_technique_ids.add(mitre_technique_id)\n", | ||
"\n", | ||
" return frozenset(all_technique_ids)\n", | ||
"\n", | ||
"def get_campaign_techniques(filepath: str) -> tuple[frozenset[str]]:\n", | ||
" \"\"\"Gets a set of MITRE technique ids present in each campaign.\"\"\"\n", | ||
"\n", | ||
" with open(filepath) as f:\n", | ||
" data = json.load(f)\n", | ||
"\n", | ||
" campaigns = data[\"bags_of_techniques\"]\n", | ||
"\n", | ||
" ret = []\n", | ||
"\n", | ||
" for campaign in campaigns:\n", | ||
"\n", | ||
" techniques = campaign[\"mitre_techniques\"]\n", | ||
" ret.append(frozenset(techniques.keys()))\n", | ||
"\n", | ||
" return ret\n", | ||
"\n", | ||
"def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:\n", | ||
" n = len(indices)\n", | ||
" assert len(values) == n\n", | ||
"\n", | ||
" indices_for_test_set = frozenset(random.sample(range(n), k=math.floor(test_ratio * n)))\n", | ||
"\n", | ||
" train_indices = []\n", | ||
" test_indices = []\n", | ||
" train_values = []\n", | ||
" test_values = []\n", | ||
"\n", | ||
" for i in range(n):\n", | ||
" if i in indices_for_test_set:\n", | ||
" test_indices.append(indices[i])\n", | ||
" test_values.append(values[i])\n", | ||
" else:\n", | ||
" train_indices.append(indices[i])\n", | ||
" train_values.append(values[i])\n", | ||
"\n", | ||
" return train_indices, train_values, test_indices, test_values\n", | ||
"\n", | ||
"\n", | ||
"def main():\n", | ||
" # want matrix of campaigns on horizontal, techniques on vertical\n", | ||
" all_mitre_technique_ids = tuple(get_mitre_technique_ids(\"../enterprise-attack.json\"))\n", | ||
" mitre_technique_ids_to_index = {all_mitre_technique_ids[i]: i for i in range(len(all_mitre_technique_ids))}\n", | ||
"\n", | ||
" campaigns = get_campaign_techniques(\"../data/combined_dataset_full_frequency.json\")\n", | ||
"\n", | ||
" indices = []\n", | ||
" values = []\n", | ||
"\n", | ||
" # for each campaign, make a vector, filling in each present technique with a 1\n", | ||
" for i in range(len(campaigns)):\n", | ||
"\n", | ||
" campaign = campaigns[i]\n", | ||
"\n", | ||
" for mitre_technique_id in campaign:\n", | ||
" if mitre_technique_id in mitre_technique_ids_to_index:\n", | ||
" # campaign id, technique id\n", | ||
" index = [i, mitre_technique_ids_to_index[mitre_technique_id]]\n", | ||
"\n", | ||
" indices.append(index)\n", | ||
" values.append(1)\n", | ||
"\n", | ||
" train_indices, train_values, test_indices, test_values = train_test_split(indices, values)\n", | ||
"\n", | ||
" training_data = tf.SparseTensor(\n", | ||
" indices=train_indices,\n", | ||
" values=train_values,\n", | ||
" dense_shape=(len(campaigns), len(all_mitre_technique_ids))\n", | ||
" )\n", | ||
" test_data = tf.SparseTensor(\n", | ||
" indices=test_indices,\n", | ||
" values=test_values,\n", | ||
" dense_shape=(len(campaigns), len(all_mitre_technique_ids))\n", | ||
" )\n", | ||
"\n", | ||
" # train\n", | ||
" model = FactorizationRecommender(m=len(campaigns), n=len(all_mitre_technique_ids), k=10)\n", | ||
" model.fit(training_data, num_iterations=1000, learning_rate=10.)\n", | ||
"\n", | ||
" evaluation = model.evaluate(test_data)\n", | ||
" print(evaluation)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"From /Users/mjturner/code/technique-inference-engine/models/recommender/factorization_recommender.py:115: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n", | ||
"Instructions for updating:\n", | ||
"To construct input pipelines, use the `tf.data` module.\n", | ||
"`tf.train.start_queue_runners()` was called when no queue runners were defined. You can safely remove the call to this deprecated function.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"2024-02-19 14:16:11.445364: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"0.34506088\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"main()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "tie", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .factorization_recommender import FactorizationRecommender | ||
|
||
__all__ = [ | ||
"FactorizationRecommender", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
# Code adapted from https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems | ||
|
||
import collections | ||
import tensorflow as tf | ||
|
||
# Run TensorFlow in 1.x-style graph mode: FactorizationRecommender below trains and evaluates through a tf.compat.v1.Session rather than eager execution. | ||
tf.compat.v1.disable_v2_behavior() | ||
tf.compat.v1.disable_eager_execution() | ||
|
||
|
||
class FactorizationRecommender:
    """A matrix factorization collaborative filtering recommender model.

    The model learns an embedding matrix U for the m entities and an
    embedding matrix V for the n items such that U V^T approximates the
    observed entries of the m x n data matrix.
    """

    # Abstraction function:
    #   AF(m, n, k) = a matrix factorization recommender model
    #       on m entities, n items to recommend, and
    #       embedding dimension k (a hyperparameter)
    # Rep invariant:
    #   - U and V are 2D
    #   - U.shape[1] == V.shape[1]
    #   - U.shape[0] > 0
    #   - U.shape[1] > 0
    #   - V.shape[0] > 0
    #   - V.shape[1] > 0
    # Safety from rep exposure:
    #   - U and V are private and not reassigned

    def __init__(self, m, n, k):
        """Initializes a FactorizationRecommender object.

        Args:
            m: number of individual embeddings
            n: number of item embeddings
            k: embedding dimension
        """
        init_stddev = 0.5

        # Both embedding matrices start as small random normal values.
        self._U = tf.Variable(tf.random.normal([m, k], stddev=init_stddev))
        self._V = tf.Variable(tf.random.normal([n, k], stddev=init_stddev))

        # Created lazily by the first call to fit() and then reused, so that
        # later fit()/evaluate() calls see the trained variable values.
        self._session = None

        self._checkrep()

    def _checkrep(self):
        """Asserts the rep invariant."""
        # - U and V are 2D (checked first so that shape[1] below is valid)
        assert len(self._U.shape) == 2
        assert len(self._V.shape) == 2
        # - U.shape[1] == V.shape[1]
        assert self._U.shape[1] == self._V.shape[1]
        # - U.shape[0] > 0
        assert self._U.shape[0] > 0
        # - U.shape[1] > 0
        assert self._U.shape[1] > 0
        # - V.shape[0] > 0
        assert self._V.shape[0] > 0
        # - V.shape[1] > 0
        assert self._V.shape[1] > 0

    @tf.function
    def _calculate_mean_square_error(
        self, data: tf.SparseTensor, U: tf.Tensor, V: tf.Tensor
    ):
        r"""Calculates the mean squared error between observed values in the
        data and predictions from UV^T.

        MSE = (1 / |\Omega|) \sum_{(i,j) \in \Omega} (data_{ij} - U_i \cdot V_j)^2

        where \Omega is the set of observed entries in data.

        Args:
            data: A matrix of observations of dense_shape [m, n]
            U: A dense Tensor of shape [m, k] where k is the embedding
                dimension, such that U_i is the embedding of element i.
            V: A dense Tensor of shape [n, k] where k is the embedding
                dimension, such that V_j is the embedding of element j.

        Returns:
            A scalar Tensor representing the MSE between the true ratings and
            the model's predictions.
        """
        # Only the observed (i, j) entries of U V^T contribute to the loss.
        predictions = tf.gather_nd(tf.matmul(U, V, transpose_b=True), data.indices)
        return tf.losses.mean_squared_error(data.values, predictions)

    def fit(self, data: tf.SparseTensor, num_iterations: int, learning_rate: float):
        """Fits the model to data.

        The first call creates the session and initializes U and V; later
        calls reuse that session and continue from the current embeddings.

        Args:
            data: an mxn tensor of training data
            num_iterations: number of training iterations to execute
            learning_rate: the learning rate
        """
        # TODO: what is the impact of defining loss locally rather than as a
        # class variable?
        loss = self._calculate_mean_square_error(data, self._U, self._V)

        with loss.graph.as_default():
            opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
            train_op = opt.minimize(loss)
            local_init_op = tf.group(
                tf.compat.v1.variables_initializer(opt.variables()),
                tf.compat.v1.local_variables_initializer(),
            )
            if self._session is None:
                self._session = tf.compat.v1.Session()
                with self._session.as_default():
                    self._session.run(tf.compat.v1.global_variables_initializer())
                    self._session.run(tf.compat.v1.tables_initializer())

        with self._session.as_default():
            local_init_op.run()
            # Execute exactly num_iterations gradient descent steps.
            for _ in range(num_iterations):
                self._session.run(train_op)

    def evaluate(self, test_data: tf.SparseTensor) -> float:
        """Evaluates the solution.

        Requires that the model has been trained.

        Args:
            test_data: mxn tensor on which to evaluate the model.
                Requires that mxn match the dimensions of the training tensor
                and each row i and column j correspond to the same entity and
                item in the training tensor, respectively.

        Returns:
            The mean squared error of the test data.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self._session is None:
            raise RuntimeError("evaluate() requires a trained model; call fit() first")

        # Use as_default() rather than `with self._session:` so the session is
        # NOT closed on exit and the model can be evaluated or trained again.
        with self._session.as_default():
            error = self._calculate_mean_square_error(
                test_data, self._U, self._V
            ).eval()

        return error
Oops, something went wrong.