Merge pull request #3 from center-for-threat-informed-defense/matt/in…

…itial-model Matt/initial model
center-for-threat-informed-defense · Mar 5, 2024 · 92563a6 · 92563a6
2 parents 42c55aa + 72b7878
commit 92563a6
Show file tree

Hide file tree

Showing 11 changed files with 730,644 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,5 @@ docs/_build/
 .mypy_cache/
 *.tmp
 TODO*
+.DS_Store
+.pre-commit-config.yaml
diff --git a/data/combined_dataset_full_frequency.json b/data/combined_dataset_full_frequency.json
@@ -6942,4 +6942,4 @@
             "origin_of_data": "adversary emulation plans"
         }
     ]
-}
+}
diff --git a/enterprise-attack.json b/enterprise-attack.json
diff --git a/models/__init__.py b/models/__init__.py
diff --git a/models/main.ipynb b/models/main.ipynb
@@ -0,0 +1,189 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "From /opt/homebrew/anaconda3/envs/tie/lib/python3.11/site-packages/tensorflow/python/compat/v2_compat.py:108: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "non-resource variables are not supported in the long term\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Imports\n",
+    "import json\n",
+    "from mitreattack.stix20 import MitreAttackData\n",
+    "import tensorflow as tf\n",
+    "from recommender import FactorizationRecommender\n",
+    "import random\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_mitre_technique_ids(stix_filepath: str) -> frozenset[str]:\n",
+    "    \"\"\"Gets all MITRE technique ids.\"\"\"\n",
+    "    mitre_attack_data = MitreAttackData(stix_filepath)\n",
+    "    techniques = mitre_attack_data.get_techniques(remove_revoked_deprecated=True)\n",
+    "\n",
+    "    all_technique_ids = set()\n",
+    "\n",
+    "    for technique in techniques:\n",
+    "        external_references = technique.get(\"external_references\")\n",
+    "        mitre_references = tuple(filter(lambda external_reference: external_reference.get(\"source_name\") == \"mitre-attack\", external_references))\n",
+    "        assert len(mitre_references) == 1\n",
+    "        mitre_technique_id = mitre_references[0][\"external_id\"]\n",
+    "        all_technique_ids.add(mitre_technique_id)\n",
+    "\n",
+    "    return frozenset(all_technique_ids)\n",
+    "\n",
+    "def get_campaign_techniques(filepath: str) -> tuple[frozenset[str]]:\n",
+    "    \"\"\"Gets a set of MITRE technique ids present in each campaign.\"\"\"\n",
+    "\n",
+    "    with open(filepath) as f:\n",
+    "        data = json.load(f)\n",
+    "\n",
+    "    campaigns = data[\"bags_of_techniques\"]\n",
+    "\n",
+    "    ret = []\n",
+    "\n",
+    "    for campaign in campaigns:\n",
+    "\n",
+    "        techniques = campaign[\"mitre_techniques\"]\n",
+    "        ret.append(frozenset(techniques.keys()))\n",
+    "\n",
+    "    return ret\n",
+    "\n",
+    "def train_test_split(indices: list, values: list, test_ratio: float=0.1) -> tuple:\n",
+    "    n = len(indices)\n",
+    "    assert len(values) == n\n",
+    "\n",
+    "    indices_for_test_set = frozenset(random.sample(range(n), k=math.floor(test_ratio * n)))\n",
+    "\n",
+    "    train_indices = []\n",
+    "    test_indices = []\n",
+    "    train_values = []\n",
+    "    test_values = []\n",
+    "\n",
+    "    for i in range(n):\n",
+    "        if i in indices_for_test_set:\n",
+    "            test_indices.append(indices[i])\n",
+    "            test_values.append(values[i])\n",
+    "        else:\n",
+    "            train_indices.append(indices[i])\n",
+    "            train_values.append(values[i])\n",
+    "\n",
+    "    return train_indices, train_values, test_indices, test_values\n",
+    "\n",
+    "\n",
+    "def main():\n",
+    "    # want matrix of campaigns on horizontal, techniques on vertical\n",
+    "    all_mitre_technique_ids = tuple(get_mitre_technique_ids(\"../enterprise-attack.json\"))\n",
+    "    mitre_technique_ids_to_index = {all_mitre_technique_ids[i]: i for i in range(len(all_mitre_technique_ids))}\n",
+    "\n",
+    "    campaigns = get_campaign_techniques(\"../data/combined_dataset_full_frequency.json\")\n",
+    "\n",
+    "    indices = []\n",
+    "    values = []\n",
+    "\n",
+    "    # for each campaign, make a vector, filling in each present technique with a 1\n",
+    "    for i in range(len(campaigns)):\n",
+    "\n",
+    "        campaign = campaigns[i]\n",
+    "\n",
+    "        for mitre_technique_id in campaign:\n",
+    "            if mitre_technique_id in mitre_technique_ids_to_index:\n",
+    "                # campaign id, technique id\n",
+    "                index = [i, mitre_technique_ids_to_index[mitre_technique_id]]\n",
+    "\n",
+    "                indices.append(index)\n",
+    "                values.append(1)\n",
+    "\n",
+    "    train_indices, train_values, test_indices, test_values = train_test_split(indices, values)\n",
+    "\n",
+    "    training_data = tf.SparseTensor(\n",
+    "        indices=train_indices,\n",
+    "        values=train_values,\n",
+    "        dense_shape=(len(campaigns), len(all_mitre_technique_ids))\n",
+    "    )\n",
+    "    test_data = tf.SparseTensor(\n",
+    "        indices=test_indices,\n",
+    "        values=test_values,\n",
+    "        dense_shape=(len(campaigns), len(all_mitre_technique_ids))\n",
+    "    )\n",
+    "\n",
+    "    # train\n",
+    "    model = FactorizationRecommender(m=len(campaigns), n=len(all_mitre_technique_ids), k=10)\n",
+    "    model.fit(training_data, num_iterations=1000, learning_rate=10.)\n",
+    "\n",
+    "    evaluation = model.evaluate(test_data)\n",
+    "    print(evaluation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "From /Users/mjturner/code/technique-inference-engine/models/recommender/factorization_recommender.py:115: start_queue_runners (from tensorflow.python.training.queue_runner_impl) is deprecated and will be removed in a future version.\n",
+      "Instructions for updating:\n",
+      "To construct input pipelines, use the `tf.data` module.\n",
+      "`tf.train.start_queue_runners()` was called when no queue runners were defined. You can safely remove the call to this deprecated function.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-02-19 14:16:11.445364: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.34506088\n"
+     ]
+    }
+   ],
+   "source": [
+    "main()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tie",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/models/recommender/__init__.py b/models/recommender/__init__.py
@@ -0,0 +1,5 @@
+from .factorization_recommender import FactorizationRecommender
+
+__all__ = [
+    "FactorizationRecommender",
+]
diff --git a/models/recommender/factorization_recommender.py b/models/recommender/factorization_recommender.py
@@ -0,0 +1,153 @@
+# Code adapted from https://colab.research.google.com/github/google/eng-edu/blob/main/ml/recommendation-systems/recommendation-systems.ipynb?utm_source=ss-recommendation-systems&utm_campaign=colab-external&utm_medium=referral&utm_content=recommendation-systems
+
+import collections
+import tensorflow as tf
+
+tf.compat.v1.disable_v2_behavior()
+tf.compat.v1.disable_eager_execution()
+
+
+class FactorizationRecommender:
+    """A matrix factorization collaborative filtering recommender model."""
+
+    # Abstraction function:
+    #   AF(m, n, k) = a matrix factorization recommender model
+    #       on m entities, n items to recommend, and
+    #       embedding dimension k (a hyperparameter)
+    # Rep invariant:
+    #   - U.shape[1] == V.shape[1]
+    #   - U and V are 2D
+    #   - U.shape[0] > 0
+    #   - U.shape[1] > 0
+    #   - V.shape[0] > 0
+    #   - V.shape[1] > 0
+    # Safety from rep exposure:
+    #   - U and V are private and not reassigned
+
+    def __init__(self, m, n, k):
+        """Initializes a FactorizationRecommender object.
+
+        Args:
+            m: number of individual embeddings
+            n: number of item embeddings
+            k: embedding dimension
+        """
+        init_stddev = 0.5
+
+        U = tf.Variable(tf.random.normal([m, k], stddev=init_stddev))
+        V = tf.Variable(tf.random.normal([n, k], stddev=init_stddev))
+
+        self._session = None
+        self._U = U
+        self._V = V
+
+        self._checkrep()
+
+    def _checkrep(self):
+        """Asserts the rep invariant."""
+        #   - U.shape[1] == V.shape[1]
+        assert self._U.shape[1] == self._V.shape[1]
+        #   - U and V are 2D
+        assert len(self._U.shape) == 2
+        assert len(self._V.shape) == 2
+        #   - U.shape[0] > 0
+        assert self._U.shape[0] > 0
+        #   - U.shape[1] > 0
+        assert self._U.shape[1] > 0
+        #   - V.shape[0] > 0
+        assert self._V.shape[0] > 0
+        #   - V.shape[1] > 0
+        assert self._V.shape[1] > 0
+
+    @tf.function
+    def _calculate_mean_square_error(
+        self, data: tf.SparseTensor, U: tf.Tensor, V: tf.Tensor
+    ):
+        """Calculates the mean squared error between observed values in the
+        data and predictions from UV^T.
+
+        MSE = \sum_{(i,j) \in \Omega} (data_{ij} U_i \dot V_j)^2
+        where Omega is the set of observed entries in training_data.
+
+        Args:
+            data: A matrix of observations of dense_shape [N, M]
+            UY: A dense Tensor of shape [N, k] where k is the embedding
+            dimension, such that U_i is the embedding of element i.
+            V: A dense Tensor of shape [M, k] where k is the embedding
+            dimension, such that V_j is the embedding of element j.
+        Returns:
+            A scalar Tensor representing the MSE between the true ratings and the
+            model's predictions.
+        """
+        predictions = tf.gather_nd(tf.matmul(U, V, transpose_b=True), data.indices)
+        loss = tf.losses.mean_squared_error(data.values, predictions)
+        return loss
+
+    def fit(self, data: tf.SparseTensor, num_iterations: int, learning_rate: float):
+        """Fits the model to data.
+
+        Args:
+            data: an mxn tensor of training data
+            num_iterations: number of training iterations to execute
+            learning_rate: the learning rate
+        """
+        # preliminaries
+        optimizer = tf.compat.v1.train.GradientDescentOptimizer
+
+        loss = self._calculate_mean_square_error(data, self._U, self._V)
+        metrics = [
+            {
+                "train_error": loss,
+            }
+        ]
+
+        with loss.graph.as_default():
+            opt = optimizer(learning_rate)
+            # TODO what is impact of defining loss lcoally rather than class var
+            train_op = opt.minimize(loss)
+            local_init_op = tf.group(
+                tf.compat.v1.variables_initializer(opt.variables()),
+                tf.compat.v1.local_variables_initializer(),
+            )
+            if self._session is None:
+                self._session = tf.compat.v1.Session()
+                with self._session.as_default():
+                    self._session.run(tf.compat.v1.global_variables_initializer())
+                    self._session.run(tf.compat.v1.tables_initializer())
+                    tf.compat.v1.train.start_queue_runners()
+
+        with self._session.as_default():
+            local_init_op.run()
+            iterations = []
+            # metrics = self._metrics or ({},)
+            metrics_vals = [collections.defaultdict(list) for _ in metrics]
+
+            # Train and append results.
+            for i in range(num_iterations + 1):
+                _, results = self._session.run((train_op, metrics))
+                if (i % 10 == 0) or i == num_iterations:
+                    iterations.append(i)
+                    for metric_val, result in zip(metrics_vals, results):
+                        for k, v in result.items():
+                            metric_val[k].append(v)
+
+    def evaluate(self, test_data: tf.SparseTensor) -> float:
+        """Evaluates the solution.
+
+        Requires that the model has been trained.
+
+        Args:
+            test_data: mxn tensor on which to evaluate the model.
+                Requires that mxn match the dimensions of the training tensor and
+                each row i and column j correspond to the same entity and item
+                in the training tensor, respectively.
+
+        Returns: the mean squared error of the test data.
+        """
+
+        with self._session as sess:
+            error = self._calculate_mean_square_error(
+                test_data, self._U, self._V
+            ).eval()
+
+        return error