Skip to content

Commit

Permalink
Merge pull request #3 from center-for-threat-informed-defense/matt/in…
Browse files Browse the repository at this point in the history
…itial-model

Matt/initial model
  • Loading branch information
jlasky2 authored Mar 5, 2024
2 parents 42c55aa + 72b7878 commit 92563a6
Show file tree
Hide file tree
Showing 11 changed files with 730,644 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ docs/_build/
.mypy_cache/
*.tmp
TODO*
.DS_Store
.pre-commit-config.yaml
2 changes: 1 addition & 1 deletion data/combined_dataset_full_frequency.json
Original file line number Diff line number Diff line change
Expand Up @@ -6942,4 +6942,4 @@
"origin_of_data": "adversary emulation plans"
}
]
}
}
727,499 changes: 727,499 additions & 0 deletions enterprise-attack.json

Large diffs are not rendered by default.

Empty file added models/__init__.py
Empty file.
189 changes: 189 additions & 0 deletions models/main.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From /opt/homebrew/anaconda3/envs/tie/lib/python3.11/site-packages/tensorflow/python/compat/v2_compat.py:108: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"non-resource variables are not supported in the long term\n"
]
}
],
"source": [
"# Imports\n",
"import json\n",
"from mitreattack.stix20 import MitreAttackData\n",
"import tensorflow as tf\n",
"from recommender import FactorizationRecommender\n",
"import random\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def get_mitre_technique_ids(stix_filepath: str) -> frozenset[str]:\n",
"    \"\"\"Gets the ids of all MITRE techniques that are not revoked or deprecated.\n",
"\n",
"    Args:\n",
"        stix_filepath: path to the ATT&CK STIX bundle handed to MitreAttackData.\n",
"\n",
"    Returns:\n",
"        The external_id of each technique's mitre-attack reference.\n",
"\n",
"    Raises:\n",
"        ValueError: if a technique does not have exactly one mitre-attack reference.\n",
"    \"\"\"\n",
"    mitre_attack_data = MitreAttackData(stix_filepath)\n",
"    techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)\n",
"\n",
"    all_technique_ids = set()\n",
"\n",
"    for technique in techniques:\n",
"        # a technique without any references is rejected by the count check below\n",
"        external_references = technique.get(\"external_references\") or ()\n",
"        mitre_references = [\n",
"            reference\n",
"            for reference in external_references\n",
"            if reference.get(\"source_name\") == \"mitre-attack\"\n",
"        ]\n",
"        # explicit check rather than assert so it still runs under python -O\n",
"        if len(mitre_references) != 1:\n",
"            raise ValueError(\n",
"                \"expected exactly one mitre-attack reference per technique, \"\n",
"                f\"found {len(mitre_references)}\"\n",
"            )\n",
"        all_technique_ids.add(mitre_references[0][\"external_id\"])\n",
"\n",
"    return frozenset(all_technique_ids)\n",
"\n",
"def get_campaign_techniques(filepath: str) -> tuple[frozenset[str], ...]:\n",
"    \"\"\"Gets the set of MITRE technique ids present in each campaign.\n",
"\n",
"    Args:\n",
"        filepath: path to a JSON file whose top-level \"bags_of_techniques\" list\n",
"            holds one object per campaign, each with a \"mitre_techniques\"\n",
"            mapping keyed by technique id.\n",
"\n",
"    Returns:\n",
"        One frozenset of technique ids per campaign, in file order.\n",
"    \"\"\"\n",
"    # JSON is UTF-8; do not depend on the platform's default encoding\n",
"    with open(filepath, encoding=\"utf-8\") as f:\n",
"        data = json.load(f)\n",
"\n",
"    return tuple(\n",
"        frozenset(campaign[\"mitre_techniques\"])\n",
"        for campaign in data[\"bags_of_techniques\"]\n",
"    )\n",
"\n",
"def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:\n",
"    \"\"\"Randomly partitions parallel index/value lists into train and test sets.\n",
"\n",
"    Args:\n",
"        indices: the entries to split\n",
"        values: the value for each entry of indices; must have the same length\n",
"        test_ratio: fraction of the entries (rounded down) put in the test set\n",
"\n",
"    Returns:\n",
"        A tuple (train_indices, train_values, test_indices, test_values), each\n",
"        keeping the relative order of the input.\n",
"    \"\"\"\n",
"    n = len(indices)\n",
"    assert len(values) == n\n",
"\n",
"    num_test = math.floor(test_ratio * n)\n",
"    held_out = frozenset(random.sample(range(n), k=num_test))\n",
"\n",
"    # slot 0 collects the training entries, slot 1 the test entries\n",
"    index_parts = ([], [])\n",
"    value_parts = ([], [])\n",
"\n",
"    for position in range(n):\n",
"        slot = int(position in held_out)\n",
"        index_parts[slot].append(indices[position])\n",
"        value_parts[slot].append(values[position])\n",
"\n",
"    train_indices, test_indices = index_parts\n",
"    train_values, test_values = value_parts\n",
"\n",
"    return train_indices, train_values, test_indices, test_values\n",
"\n",
"\n",
"def main():\n",
"    \"\"\"Trains a FactorizationRecommender on the campaign data and prints its test error.\n",
"\n",
"    Builds a sparse campaign x technique matrix holding a 1 for every known\n",
"    technique observed in a campaign, splits the observed entries with\n",
"    train_test_split, fits the model on the training entries and prints the\n",
"    result of evaluating it on the held-out entries.\n",
"    \"\"\"\n",
"    # rows are campaigns, columns are techniques; sorting keeps the\n",
"    # technique -> column mapping stable between runs (set order is not)\n",
"    all_mitre_technique_ids = sorted(get_mitre_technique_ids(\"../enterprise-attack.json\"))\n",
"    mitre_technique_ids_to_index = {\n",
"        technique_id: column\n",
"        for column, technique_id in enumerate(all_mitre_technique_ids)\n",
"    }\n",
"\n",
"    campaigns = get_campaign_techniques(\"../data/combined_dataset_full_frequency.json\")\n",
"\n",
"    indices = []\n",
"    values = []\n",
"\n",
"    # for each campaign, record a 1 at (campaign row, technique column)\n",
"    for row, campaign in enumerate(campaigns):\n",
"        for mitre_technique_id in campaign:\n",
"            # ids that are not in the technique list are skipped\n",
"            column = mitre_technique_ids_to_index.get(mitre_technique_id)\n",
"            if column is not None:\n",
"                indices.append([row, column])\n",
"                values.append(1)\n",
"\n",
"    train_indices, train_values, test_indices, test_values = train_test_split(indices, values)\n",
"\n",
"    # both tensors share the full matrix shape so rows/columns line up\n",
"    dense_shape = (len(campaigns), len(all_mitre_technique_ids))\n",
"\n",
"    training_data = tf.SparseTensor(\n",
"        indices=train_indices,\n",
"        values=train_values,\n",
"        dense_shape=dense_shape\n",
"    )\n",
"    test_data = tf.SparseTensor(\n",
"        indices=test_indices,\n",
"        values=test_values,\n",
"        dense_shape=dense_shape\n",
"    )\n",
"\n",
"    # train\n",
"    model = FactorizationRecommender(m=len(campaigns), n=len(all_mitre_technique_ids), k=10)\n",
"    model.fit(training_data, num_iterations=1000, learning_rate=10.)\n",
"\n",
"    evaluation = model.evaluate(test_data)\n",
"    print(evaluation)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"From /Users/mjturner/code/technique-inference-engine/models/recommender/factorization_recommender.py:115: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"To construct input pipelines, use the `tf.data` module.\n",
"`tf.train.start_queue_runners()` was called when no queue runners were defined. You can safely remove the call to this deprecated function.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-02-19 14:16:11.445364: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.34506088\n"
]
}
],
"source": [
"main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "tie",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
5 changes: 5 additions & 0 deletions models/recommender/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .factorization_recommender import FactorizationRecommender

# Names exported by ``from recommender import *``.
__all__ = [
    "FactorizationRecommender",
]
153 changes: 153 additions & 0 deletions models/recommender/factorization_recommender.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Code adapted from https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems

import collections
import tensorflow as tf

# FactorizationRecommender below is written against the TF1 graph/session API
# (tf.compat.v1.Session and explicit initializer ops), so TF2 behavior and
# eager execution are switched off when this module is imported.
tf.compat.v1.disable_v2_behavior()
tf.compat.v1.disable_eager_execution()


class FactorizationRecommender:
    """A matrix factorization collaborative filtering recommender model.

    The model holds an embedding matrix U for the m entities and an embedding
    matrix V for the n items; the prediction for entity i and item j is the
    dot product of row i of U with row j of V.
    """

    # Abstraction function:
    #   AF(m, n, k) = a matrix factorization recommender model
    #       on m entities, n items to recommend, and
    #       embedding dimension k (a hyperparameter)
    # Rep invariant:
    #   - U.shape[1] == V.shape[1]
    #   - U and V are 2D
    #   - U.shape[0] > 0
    #   - U.shape[1] > 0
    #   - V.shape[0] > 0
    #   - V.shape[1] > 0
    # Safety from rep exposure:
    #   - U and V are private and not reassigned

    def __init__(self, m, n, k):
        """Initializes a FactorizationRecommender object.

        Args:
            m: number of individual embeddings
            n: number of item embeddings
            k: embedding dimension
        """
        init_stddev = 0.5

        U = tf.Variable(tf.random.normal([m, k], stddev=init_stddev))
        V = tf.Variable(tf.random.normal([n, k], stddev=init_stddev))

        # the session is created lazily by the first call to fit()
        self._session = None
        self._U = U
        self._V = V

        self._checkrep()

    def _checkrep(self):
        """Asserts the rep invariant."""
        #   - U.shape[1] == V.shape[1]
        assert self._U.shape[1] == self._V.shape[1]
        #   - U and V are 2D
        assert len(self._U.shape) == 2
        assert len(self._V.shape) == 2
        #   - U.shape[0] > 0
        assert self._U.shape[0] > 0
        #   - U.shape[1] > 0
        assert self._U.shape[1] > 0
        #   - V.shape[0] > 0
        assert self._V.shape[0] > 0
        #   - V.shape[1] > 0
        assert self._V.shape[1] > 0

    @tf.function
    def _calculate_mean_square_error(
        self, data: tf.SparseTensor, U: tf.Tensor, V: tf.Tensor
    ):
        r"""Calculates the mean squared error between observed values in the
        data and predictions from UV^T.

        MSE = \frac{1}{|\Omega|} \sum_{(i,j) \in \Omega} (data_{ij} - U_i \cdot V_j)^2
        where Omega is the set of observed entries in data.

        Args:
            data: A matrix of observations of dense_shape [N, M]
            U: A dense Tensor of shape [N, k] where k is the embedding
                dimension, such that U_i is the embedding of element i.
            V: A dense Tensor of shape [M, k] where k is the embedding
                dimension, such that V_j is the embedding of element j.

        Returns:
            A scalar Tensor representing the MSE between the true ratings and the
            model's predictions.
        """
        # only the entries listed in data.indices contribute to the error
        predictions = tf.gather_nd(tf.matmul(U, V, transpose_b=True), data.indices)
        loss = tf.losses.mean_squared_error(data.values, predictions)
        return loss

    def fit(self, data: tf.SparseTensor, num_iterations: int, learning_rate: float):
        """Fits the model to data using gradient descent on the MSE.

        The first call creates the session and initializes U and V; later
        calls reuse that session and keep training the same variables.

        Args:
            data: an mxn tensor of training data
            num_iterations: number of training iterations to execute; the
                loop runs the update step num_iterations + 1 times
            learning_rate: the learning rate
        """
        loss = self._calculate_mean_square_error(data, self._U, self._V)

        with loss.graph.as_default():
            opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
            train_op = opt.minimize(loss)
            local_init_op = tf.group(
                tf.compat.v1.variables_initializer(opt.variables()),
                tf.compat.v1.local_variables_initializer(),
            )
            if self._session is None:
                self._session = tf.compat.v1.Session()
                with self._session.as_default():
                    self._session.run(tf.compat.v1.global_variables_initializer())
                    self._session.run(tf.compat.v1.tables_initializer())

        with self._session.as_default():
            local_init_op.run()
            for _ in range(num_iterations + 1):
                self._session.run(train_op)

    def evaluate(self, test_data: tf.SparseTensor) -> float:
        """Evaluates the solution.

        Requires that the model has been trained.

        Args:
            test_data: mxn tensor on which to evaluate the model.
                Requires that mxn match the dimensions of the training tensor and
                each row i and column j correspond to the same entity and item
                in the training tensor, respectively.

        Returns: the mean squared error of the test data.

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self._session is None:
            raise RuntimeError("evaluate() requires a trained model; call fit() first")

        # as_default() rather than `with self._session:` so the session is not
        # closed on exit and the model can be evaluated or trained again
        with self._session.graph.as_default(), self._session.as_default():
            error = self._calculate_mean_square_error(
                test_data, self._U, self._V
            ).eval()

        return error
Loading

0 comments on commit 92563a6

Please sign in to comment.