From ccd7feb5033369bf6ed39b4c60303c37e9798357 Mon Sep 17 00:00:00 2001 From: aMahanna Date: Fri, 5 Nov 2021 16:22:21 -0400 Subject: [PATCH] update: notebooks (also used nbstripout) --- examples/ArangoDB_NetworkxAdapter.ipynb | 960 +++--- examples/IMDB_Networkx_Adapter.ipynb | 3528 ++++++++------------- examples/ITSM_ArangoDB_Adapter.ipynb | 70 +- examples/ITSM_EDA.ipynb | 579 ++-- examples/batch_graph_pre_processing.ipynb | 887 +++--- 5 files changed, 2513 insertions(+), 3511 deletions(-) diff --git a/examples/ArangoDB_NetworkxAdapter.ipynb b/examples/ArangoDB_NetworkxAdapter.ipynb index 2e03ce5c..cb4b0f30 100644 --- a/examples/ArangoDB_NetworkxAdapter.ipynb +++ b/examples/ArangoDB_NetworkxAdapter.ipynb @@ -1,531 +1,435 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "q8KesL7xeG89" + }, + "source": [ + "# ArangoDB NetworkX Adapter Getting Started Guide " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U1d45V4OeG89" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nx9aa3LAeG89" + }, + "source": [ + "![networkX](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/logos/networkx_logo.svg?raw=1) \n", + "![arangodb](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/logos/ArangoDB_logo.png?raw=1) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bpvZS-1aeG89" + }, + "source": [ + "Version: 0.0.0.2.5.3\n", + "\n", + "In this Notebook we learn how to export Graphs from [ArangoDB](https://www.arangodb.com/), a multi-model Graph 
Database into [NetworkX](https://networkx.github.io/), the swiss army knife for graph analysis in Python." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KS9c-vE5eG89" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fUnFAFAheG89" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!git clone -b oasis_connector --single-branch https://github.com/arangodb/interactive_tutorials.git\n", + "!git clone -b 0.0.0.2.5.3 --single-branch https://github.com/arangoml/networkx-adapter.git\n", + "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", + "!rsync -av interactive_tutorials/ ./ --exclude=.git\n", + "!pip3 install adbnx_adapter==0.0.0.2.5.3\n", + "!pip3 install networkx\n", + "!pip3 install matplotlib\n", + "!pip3 install pyarango\n", + "!pip3 install python-arango" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RpqvL4COeG8-" + }, + "outputs": [], + "source": [ + "import json\n", + "import oasis\n", + "import networkx as nx\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "from adbnx_adapter.arangoDB_networkx_adapter import ArangoDB_Networkx_Adapter" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Oc__NAd1eG8-" + }, + "source": [ + "# Create a Temporary ArangoDB Instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "name": "ArangoDB_NetworkxAdapter.ipynb", - "provenance": [] - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "q8KesL7xeG89" - }, - "source": [ - "# ArangoDB NetworkX Adapter Getting Started Guide " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U1d45V4OeG89" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Nx9aa3LAeG89" - }, - "source": [ - "![networkX](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/logos/networkx_logo.svg?raw=1) 
\n", - "![arangodb](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/logos/ArangoDB_logo.png?raw=1) " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bpvZS-1aeG89" - }, - "source": [ - "In this Notebook we learn how to export Graphs from [ArangoDB](https://www.arangodb.com/), a multi-model Graph Database into [NetworkX](https://networkx.github.io/), the swiss army knife for graph analysis ion python." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KS9c-vE5eG89" - }, - "source": [ - "# Setup" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "fUnFAFAheG89" - }, - "source": [ - "%%capture\n", - "!git clone https://github.com/arangoml/networkx-adapter.git\n", - "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", - "!pip3 install networkx\n", - "!pip3 install matplotlib\n", - "!pip3 install adbnx_adapter\n", - "!pip3 install pyarango\n", - "!pip3 install python-arango" - ], - "execution_count": 1, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RpqvL4COeG8-" - }, - "source": [ - "import json\n", - "import oasis\n", - "import networkx as nx\n", - "import matplotlib.pyplot as plt\n", - "\n", - "\n", - "from pyArango.connection import *\n", - "from adbnx_adapter.arangoDB_networkx_adapter import ArangoDB_Networkx_Adapter" - ], - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Oc__NAd1eG8-" - }, - "source": [ - "# Create a Temporary ArangoDB Instance" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2ekGwnJDeG8-", - "outputId": "c0839114-a489-4fba-d438-198d930cdb90" - }, - "source": [ - "# Request temporary instance from the managed ArangoDB Cloud Oasis.\n", - "con = oasis.getTempCredentials()\n", - "\n", - "# Connect the driver to the temp database\n", - "conn = oasis.connect(con)\n", - "db = conn[con[\"dbName\"]]\n", - "\n", - "print()\n", - 
"print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n", - "print(\"Username: \" + con[\"username\"])\n", - "print(\"Password: \" + con[\"password\"])\n", - "print(\"Database: \" + con[\"dbName\"])" - ], - "execution_count": 6, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Requesting new temp credentials.\n", - "Temp database ready to use.\n", - "\n", - "https://d383fa0b596a.arangodb.cloud:8529\n", - "Username: TUTeuvmxk8xo0ua5biimu39fk\n", - "Password: TUToh5apk2ldjy7b1upbsead\n", - "Database: TUTqjuhef9nl5q0xv2e75yl\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e4QfL37neG8_" - }, - "source": [ - "Feel free to use to above URL to checkout the UI!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7y81WHO8eG8_" - }, - "source": [ - "# Data Import" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BM0iRYPDeG8_" - }, - "source": [ - "We will use an Fraud Detection example graph, explained in more detail in this [interactive notebook](https://colab.research.google.com/github/joerg84/Graph_Powered_ML_Workshop/blob/master/Fraud_Detection.ipynb)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1jWclaDdeG8_" - }, - "source": [ - "*Note the included arangorestore will only work on Linux system, if you want to run this notebook on a different OS please consider using the appropriate arangorestore from the [Download area](https://www.arangodb.com/download-major/).*" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7bgGJ3QkeG8_", - "outputId": "4715de16-766a-4902-a132-d53bc59d4f63" - }, - "source": [ - "!./tools/arangorestore -c none --server.endpoint http+ssl://{con[\"hostname\"]}:{con[\"port\"]} --server.username {con[\"username\"]} --server.database {con[\"dbName\"]} --server.password {con[\"password\"]} --default-replication-factor 3 --input-directory \"data/fraud_dump\"" - ], - "execution_count": 7, - "outputs": [ - { - "output_type": "stream", - "text": [ - "\u001b[0m2020-11-19T12:37:54Z [190] INFO [05c30] {restore} Connected to ArangoDB 'http+ssl://d383fa0b596a.arangodb.cloud:8529'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:55Z [190] INFO [abeb4] {restore} Database name in source dump is 'fraud-detection'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:55Z [190] INFO [9b414] {restore} # Re-creating document collection 'account'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:55Z [190] INFO [9b414] {restore} # Re-creating document collection 'bank'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:55Z [190] INFO [9b414] {restore} # Re-creating document collection 'branch'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:56Z [190] INFO [9b414] {restore} # Re-creating document collection 'Class'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:56Z [190] INFO [9b414] {restore} # Re-creating document collection 'customer'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:56Z [190] INFO [9b414] {restore} # Re-creating edge collection 'accountHolder'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:57Z [190] INFO [9b414] {restore} # Re-creating edge 
collection 'Relationship'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:57Z [190] INFO [9b414] {restore} # Re-creating edge collection 'transaction'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [f723c] {restore} # Creating views...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6d69f] {restore} # Dispatched 8 job(s), using 2 worker(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into document collection 'bank', data size: 183 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [d88c6] {restore} # Creating indexes for collection 'account'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored document collection 'bank'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into document collection 'branch', data size: 465 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored document collection 'branch'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into document collection 'Class', data size: 196 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored document collection 'Class'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [d88c6] {restore} # Creating indexes for collection 'customer'...\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into document collection 'account', data size: 1696 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into document collection 'customer', data size: 794 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored document collection 'account'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into edge collection 'accountHolder', data size: 
1076 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored document collection 'customer'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into edge collection 'Relationship', data size: 275 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored edge collection 'accountHolder'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [94913] {restore} # Loading data into edge collection 'transaction', data size: 2292 byte(s)\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored edge collection 'Relationship'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [6ae09] {restore} # Successfully restored edge collection 'transaction'\n", - "\u001b[0m\u001b[0m2020-11-19T12:37:58Z [190] INFO [a66e1] {restore} Processed 8 collection(s) in 4.271091 s, read 49781 byte(s) from datafiles, sent 8 data batch(es) of 49773 byte(s) total size\n", - "\u001b[0m" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "227hLXnPeG8_" - }, - "source": [ - "# Create Graph" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "howeguvmeG8_" - }, - "source": [ - "The graph we will be using in the following looks as follows:" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WqRlqnJCeG8_" - }, - "source": [ - "![networkX](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/fraud_graph.jpeg?raw=1) " - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PybHP7jpeG8_", - "outputId": "0c59cdb9-e67c-4e18-9791-69fdb4edbd6a" - }, - "source": [ - "from pyArango.collection import Collection, Edges, Field\n", - "from pyArango.graph import Graph, EdgeDefinition\n", - "\n", - "\n", - "class account(Collection):\n", - " _fields = {\n", - " \"Name\": Field()\n", - " 
}\n", - " \n", - "class customer(Collection):\n", - " _fields = {\n", - " \"Name\": Field()\n", - " }\n", - " \n", - "class transaction(Edges): \n", - " _fields = {\n", - " }\n", - "\n", - "class accountHolder(Edges): \n", - " _fields = {\n", - " }\n", - "\n", - "class FraudDetection(Graph) :\n", - " _edgeDefinitions = [EdgeDefinition(\"accountHolder\", fromCollections=[\"customer\"], toCollections=[\"account\"]),EdgeDefinition(\"transaction\", fromCollections=[\"account\"], toCollections=[\"account\"])]\n", - " _orphanedCollections = []\n", - "\n", - "fraudGraph = db.createGraph(\"FraudDetection\")\n", - "\n", - "print(\"Collection/Graph Setup done.\")" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Collection/Graph Setup done.\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ANrsn9GreG9A" - }, - "source": [ - "Feel free to visit the ArangoDB UI using the above link and login data and check the Graph!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QfE_tKxneG9A" - }, - "source": [ - "# Connect ArangoDB and NetworkX " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGfhzPT9eG9A" - }, - "source": [ - "We first connect the ArangoDB_Networkx_Adapter to our temp ArangoDB cluster:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oG496kBeeG9A" - }, - "source": [ - "ma = ArangoDB_Networkx_Adapter(conn = con)" - ], - "execution_count": 9, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uByvwf9feG9A" - }, - "source": [ - "Next, we need to define the attributes in the vertex and edge collections to be included:\n", - "\n", - "*Note, we are currently working on making this step optional in the future!*" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "UWX9-MsKeG9A" - }, - "source": [ - "fraud_detection_attributes = { 'vertexCollections': \n", - " {'account': {'Balance', 'account_type', 'customer_id', 'rank'},\\\n", - " 'bank': {'Country', 'Id', 'bank_id', 'bank_name'},\\\n", - " 'branch':{'City', 'Country', 'Id', 'bank_id', 'branch_id', 'branch_name'},\\\n", - " 'Class':{'concrete', 'label', 'name'},\\\n", - " 'customer': {'Name', 'Sex', 'Ssn', 'rank'}},\\\n", - " 'edgeCollections' : \n", - " {'accountHolder': {'_from', '_to'},\\\n", - " 'Relationship': {'_from', '_to', 'label', 'name', 'relationshipType'},\\\n", - " 'transaction': {'_from', '_to'}}}" - ], - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5pC59IV-eG9A" - }, - "source": [ - "Now, we can export the networkX graph:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hPp6n66reG9A" - }, - "source": [ - "g = ma.create_networkx_graph(graph_name = 'FraudDetection', graph_attributes = fraud_detection_attributes)" - ], - "execution_count": 12, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gsDza0PBeG9A" - }, - "source": [ - "From here on we 
can simply use all networkX functionality:" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 319 - }, - "id": "cMlIdO9NeG9A", - "outputId": "e41f8e2a-b0d3-4009-f95e-083afff1b213" - }, - "source": [ - "nx.draw(g, with_labels=True)" - ], - "execution_count": 13, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "KJmyw3JAeG9A", - "outputId": "d53d232d-132b-490d-8a3b-67607489b168" - }, - "source": [ - "g.nodes()" - ], - "execution_count": 14, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "NodeView(('account/6149748', 'account/10000041', 'account/10000043', 'account/6149781', 'account/10000036', 'account/10000040', 'account/10000042', 'account/10000001', 'account/10000035', 'account/10000027', 'account/10000044', 'account/10000008', 'account/1000054', 'account/4149551', 'account/10000034', 'account/10000015', 'account/1000053', 'account/10000014', 'account/1000050', 'account/10000003', 'account/10000022', 'account/10000017', 'account/6149795', 'account/10000030', 'account/10000020', 'account/10000013', 'account/10000039', 'account/10000029', 'account/10000019', 'account/10000028', 'account/10000037', 'account/10000012', 'account/10000024', 'account/orphan_Account_1', 'account/10000006', 'account/10000033', 'account/10000031', 'account/1000052', 'account/10000026', 'account/10000011', 'account/10000018', 'account/10000010', 'account/10000016', 'account/10000002', 'account/10000004', 'account/10000009', 'account/10000025', 'account/10000038', 'account/10000032', 'account/10000023', 'account/10000005', 'account/1000051', 'account/10000021', 'account/10000007', 'bank/1548225', 'bank/1548224', 'bank/1548226', 'branch/1548208', 'branch/1548209', 'branch/1548205', 'branch/1548211', 'branch/1548206', 'branch/1548210', 'branch/1548207', 'branch/1548212', 'branch/1548204', 'branch/1548203', 'Class/customer', 'Class/account', 'Class/bank', 'Class/branch', 'customer/10810', 'customer/10000001', 'customer/10000002', 'customer/10000012', 'customer/10000008', 'customer/10000010', 'customer/10000015', 'customer/10000006', 'customer/10000013', 'customer/10000009', 'customer/10000011', 
'customer/10000016', 'customer/10000007', 'customer/10000003', 'customer/10000005', 'customer/10000004', 'customer/10000014'))" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 14 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "eNxI-ctteG9A", - "outputId": "edaf5f28-22d8-4586-a43e-d955b48b8940" - }, - "source": [ - "g.edges()" - ], - "execution_count": 15, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "OutEdgeView([('account/6149748', 'customer/10810'), ('account/10000041', 'customer/10000016'), ('account/10000041', 'account/10000043'), ('account/10000043', 'customer/10000016'), ('account/10000043', 'account/10000044'), ('account/6149781', 'customer/10810'), ('account/10000036', 'customer/10000013'), ('account/10000036', 'account/10000041'), ('account/10000040', 'customer/10000015'), ('account/10000040', 'account/10000043'), ('account/10000042', 'customer/10000015'), ('account/10000042', 'account/10000043'), ('account/10000001', 'customer/10000008'), ('account/10000035', 'customer/10000012'), ('account/10000035', 'account/10000040'), ('account/10000027', 'customer/10000002'), ('account/10000027', 'account/10000015'), ('account/10000044', 'customer/10000003'), ('account/10000044', 'account/1000050'), ('account/10000008', 'customer/10000010'), ('account/10000008', 'account/10000010'), ('account/1000054', 'customer/10000016'), ('account/1000054', 'account/10000032'), ('account/4149551', 'customer/10000001'), ('account/10000034', 'customer/10000012'), ('account/10000034', 'account/10000039'), ('account/10000015', 'customer/10000007'), ('account/10000015', 'account/10000030'), ('account/1000053', 'customer/10000014'), ('account/1000053', 'account/1000054'), ('account/10000014', 'customer/10000006'), ('account/10000014', 'account/10000023'), ('account/10000014', 'account/10000001'), ('account/1000050', 'customer/10000003'), ('account/1000050', 
'account/1000051'), ('account/10000003', 'customer/10000004'), ('account/10000003', 'account/10000028'), ('account/10000003', 'account/10000003'), ('account/10000022', 'customer/10000006'), ('account/10000022', 'account/10000021'), ('account/10000017', 'customer/10000002'), ('account/10000017', 'account/10000027'), ('account/10000017', 'account/10000007'), ('account/6149795', 'customer/10810'), ('account/10000030', 'customer/10000006'), ('account/10000030', 'account/10000005'), ('account/10000020', 'customer/10000010'), ('account/10000020', 'account/10000014'), ('account/10000013', 'customer/10000008'), ('account/10000013', 'account/10000015'), ('account/10000039', 'customer/10000015'), ('account/10000039', 'account/10000043'), ('account/10000029', 'customer/10000010'), ('account/10000029', 'account/10000031'), ('account/10000019', 'customer/10000009'), ('account/10000028', 'customer/10000006'), ('account/10000037', 'customer/10000014'), ('account/10000037', 'account/10000042'), ('account/10000012', 'customer/10000002'), ('account/10000012', 'account/10000026'), ('account/10000012', 'account/10000021'), ('account/10000012', 'account/10000001'), ('account/10000024', 'customer/10000006'), ('account/10000024', 'account/10000030'), ('account/10000024', 'account/10000008'), ('account/orphan_Account_1', 'customer/10810'), ('account/10000006', 'customer/10000002'), ('account/10000006', 'account/10000018'), ('account/10000006', 'account/10000008'), ('account/10000006', 'account/10000003'), ('account/10000033', 'customer/10000011'), ('account/10000033', 'account/10000038'), ('account/10000031', 'customer/10000002'), ('account/10000031', 'account/10000027'), ('account/10000031', 'account/10000018'), ('account/1000052', 'customer/10000011'), ('account/1000052', 'account/1000053'), ('account/10000026', 'customer/10000008'), ('account/10000011', 'customer/10000009'), ('account/10000011', 'account/10000007'), ('account/10000011', 'account/10000023'), ('account/10000018', 
'customer/10000010'), ('account/10000018', 'account/10000025'), ('account/10000010', 'customer/10000007'), ('account/10000010', 'account/10000004'), ('account/10000016', 'customer/10000004'), ('account/10000016', 'account/10000015'), ('account/10000002', 'customer/10000005'), ('account/10000002', 'account/10000001'), ('account/10000002', 'account/10000028'), ('account/10000004', 'customer/10000006'), ('account/10000009', 'customer/10000009'), ('account/10000009', 'account/10000006'), ('account/10000025', 'customer/10000010'), ('account/10000025', 'account/10000019'), ('account/10000038', 'customer/10000014'), ('account/10000038', 'account/10000043'), ('account/10000032', 'customer/10000011'), ('account/10000032', 'account/10000037'), ('account/10000032', 'account/10000035'), ('account/10000032', 'account/10000033'), ('account/10000032', 'account/10000034'), ('account/10000032', 'account/10000036'), ('account/10000023', 'customer/10000005'), ('account/10000023', 'account/10000003'), ('account/10000023', 'account/10000016'), ('account/10000005', 'customer/10000002'), ('account/10000005', 'account/10000013'), ('account/1000051', 'customer/10000013'), ('account/1000051', 'account/1000052'), ('account/10000021', 'customer/10000005'), ('account/10000021', 'account/10000026'), ('account/10000021', 'account/10000018'), ('account/10000007', 'customer/10000009'), ('account/10000007', 'account/10000025'), ('Class/customer', 'Class/branch'), ('Class/account', 'Class/account'), ('Class/account', 'Class/customer'), ('Class/branch', 'Class/bank')])" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 15 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "k9xyOIQ9eG9A" - }, - "source": [ - "# Next Steps" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ntiBo6X_eG9A" - }, - "source": [ - "* Explore [Interactive NetworkX Notebook](https://colab.research.google.com/github/joerg84/Graph_Powered_ML_Workshop/blob/master/NetworkX.ipynb)\n", - 
"* Explore [ArangoDB](https://www.arangodb.com/)\n", - "* Explore [Interactive ArangoDB tutorials](https://github.com/joerg84/ArangoDBUniversity#arangodb-university)" - ] - } - ] + "base_uri": "https://localhost:8080/" + }, + "id": "2ekGwnJDeG8-", + "outputId": "c0839114-a489-4fba-d438-198d930cdb90" + }, + "outputs": [], + "source": [ + "# Request temporary instance from the managed ArangoDB Cloud Oasis.\n", + "con = oasis.getTempCredentials()\n", + "\n", + "# Connect the driver to the temp database\n", + "conn = oasis.connect(con)\n", + "db = conn[con[\"dbName\"]]\n", + "\n", + "print()\n", + "print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n", + "print(\"Username: \" + con[\"username\"])\n", + "print(\"Password: \" + con[\"password\"])\n", + "print(\"Database: \" + con[\"dbName\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e4QfL37neG8_" + }, + "source": [ + "Feel free to use to above URL to checkout the UI!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7y81WHO8eG8_" + }, + "source": [ + "# Data Import" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BM0iRYPDeG8_" + }, + "source": [ + "We will use an Fraud Detection example graph, explained in more detail in this [interactive notebook](https://colab.research.google.com/github/joerg84/Graph_Powered_ML_Workshop/blob/master/Fraud_Detection.ipynb)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1jWclaDdeG8_" + }, + "source": [ + "*Note the included arangorestore will only work on Linux system, if you want to run this notebook on a different OS please consider using the appropriate arangorestore from the [Download area](https://www.arangodb.com/download-major/).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7bgGJ3QkeG8_", + "outputId": "4715de16-766a-4902-a132-d53bc59d4f63" + }, + "outputs": [], + "source": [ + "!chmod -R 755 ./tools\n", + "!./tools/arangorestore -c none --server.endpoint http+ssl://{con[\"hostname\"]}:{con[\"port\"]} --server.username {con[\"username\"]} --server.database {con[\"dbName\"]} --server.password {con[\"password\"]} --default-replication-factor 3 --input-directory \"data/fraud_dump\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "227hLXnPeG8_" + }, + "source": [ + "# Create Graph" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "howeguvmeG8_" + }, + "source": [ + "The graph we will be using in the following looks as follows:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WqRlqnJCeG8_" + }, + "source": [ + "![networkX](https://github.com/arangoml/networkx-adapter/blob/master/examples/assets/fraud_graph.jpeg?raw=1) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PybHP7jpeG8_", + "outputId": "0c59cdb9-e67c-4e18-9791-69fdb4edbd6a" + }, + "outputs": [], + "source": [ + "from pyArango.collection import Collection, Edges, Field\n", + "from pyArango.graph import Graph, EdgeDefinition\n", + "\n", + "\n", + "class account(Collection):\n", + " _fields = {\n", + " \"Name\": Field()\n", + " }\n", + " \n", + "class customer(Collection):\n", + " _fields = {\n", + " \"Name\": Field()\n", + " }\n", + " \n", + "class 
transaction(Edges): \n", + " _fields = {\n", + " }\n", + "\n", + "class accountHolder(Edges): \n", + " _fields = {\n", + " }\n", + "\n", + "class FraudDetection(Graph) :\n", + " _edgeDefinitions = [EdgeDefinition(\"accountHolder\", fromCollections=[\"customer\"], toCollections=[\"account\"]),EdgeDefinition(\"transaction\", fromCollections=[\"account\"], toCollections=[\"account\"])]\n", + " _orphanedCollections = []\n", + "\n", + "fraudGraph = db.createGraph(\"FraudDetection\")\n", + "\n", + "print(\"Collection/Graph Setup done.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ANrsn9GreG9A" + }, + "source": [ + "Feel free to visit the ArangoDB UI using the above link and login data and check the Graph!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QfE_tKxneG9A" + }, + "source": [ + "# Connect ArangoDB and NetworkX " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kGfhzPT9eG9A" + }, + "source": [ + "We first connect the ArangoDB_Networkx_Adapter to our temp ArangoDB cluster:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oG496kBeeG9A" + }, + "outputs": [], + "source": [ + "ma = ArangoDB_Networkx_Adapter(conn = con)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uByvwf9feG9A" + }, + "source": [ + "Next, we need to define the attributes in the vertex and edge collections to be included:\n", + "\n", + "*Note, we are currently working on making this step optional in the future!*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UWX9-MsKeG9A" + }, + "outputs": [], + "source": [ + "fraud_detection_attributes = { 'vertexCollections': \n", + " {'account': {'Balance', 'account_type', 'customer_id', 'rank'},\\\n", + " 'bank': {'Country', 'Id', 'bank_id', 'bank_name'},\\\n", + " 'branch':{'City', 'Country', 'Id', 'bank_id', 'branch_id', 'branch_name'},\\\n", + " 'Class':{'concrete', 'label', 'name'},\\\n", + " 'customer': 
{'Name', 'Sex', 'Ssn', 'rank'}},\\\n", + " 'edgeCollections' : \n", + " {'accountHolder': {'_from', '_to'},\\\n", + " 'Relationship': {'_from', '_to', 'label', 'name', 'relationshipType'},\\\n", + " 'transaction': {'_from', '_to'}}}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5pC59IV-eG9A" + }, + "source": [ + "Now, we can export the networkX graph:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hPp6n66reG9A" + }, + "outputs": [], + "source": [ + "g = ma.create_networkx_graph(graph_name = 'FraudDetection', graph_attributes = fraud_detection_attributes)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gsDza0PBeG9A" + }, + "source": [ + "From here on we can simply use all networkX functionality:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 319 + }, + "id": "cMlIdO9NeG9A", + "outputId": "e41f8e2a-b0d3-4009-f95e-083afff1b213" + }, + "outputs": [], + "source": [ + "nx.draw(g, with_labels=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KJmyw3JAeG9A", + "outputId": "d53d232d-132b-490d-8a3b-67607489b168" + }, + "outputs": [], + "source": [ + "g.nodes()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eNxI-ctteG9A", + "outputId": "edaf5f28-22d8-4586-a43e-d955b48b8940" + }, + "outputs": [], + "source": [ + "g.edges()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k9xyOIQ9eG9A" + }, + "source": [ + "# Next Steps" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ntiBo6X_eG9A" + }, + "source": [ + "* Explore [Interactive NetworkX Notebook](https://colab.research.google.com/github/joerg84/Graph_Powered_ML_Workshop/blob/master/NetworkX.ipynb)\n", + "* Explore 
[ArangoDB](https://www.arangodb.com/)\n", + "* Explore [Interactive ArangoDB tutorials](https://github.com/joerg84/ArangoDBUniversity#arangodb-university)" + ] + } + ], + "metadata": { + "colab": { + "name": "ArangoDB_NetworkxAdapter.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/examples/IMDB_Networkx_Adapter.ipynb b/examples/IMDB_Networkx_Adapter.ipynb index fc646529..1d8a3eee 100644 --- a/examples/IMDB_Networkx_Adapter.ipynb +++ b/examples/IMDB_Networkx_Adapter.ipynb @@ -1,2147 +1,1385 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "VLfArBaMRTiM" + }, + "source": [ + "# Analysis of IMDB data using the ArangoDB Networkx adapter " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wd0_wS6TRTiO" + }, + "source": [ + "This notebook will illustrate how the Networkx adapter can be used to perform graph analytics tasks on graph data stored in ArangoDB. For this example we will use the IMDB movie review dataset. The dataset provides reviews of various movies by viewers. The details of the dataset are available [here](https://www.imdb.com/interfaces/). If we use a relational representation to analyze this problem, then we will need to know what we want to look for in the data and then seek it from the data. For example we may be interested in the average rating for a particular genre of movies. The point here is that we need to know the questions that are important in the analysis of this problem a priori. 
When we use a graph representation of this data, then we can leverage the body of ideas used in network analysis (or social network analysis) to extract insights from the data. The body of ideas are the standard questions we seek answers to when analyzing any network. The answers to these questions will provide us insights about the data and the problem. In other words, we have a framework for extracting insights from the data. In the example provided in this notebook, we will use ideas from the analysis of bipartite graphs to illustrate this point. Of course, if we have a specific question that we are seeking the answer for, then that is available easily from the graph as well. Therefore, going with a graph representation makes it easier to extract insights from the data.\n", + "\n", + "The advantages of using a graph representation for the data extend beyond mining and learning from the data. Running adhoc queries for analytic purposes on the data is more efficient with a graph representation. With a relational representation, we can only optimize query performance for queries that are known beforehand (with indexes). Adhoc queries could require multiple joins and have poor performance. Most graphs have the so-called _small world effect_. In such graphs, most nodes can be reached from most other nodes with a small number of edge traversals. An adhoc query starting at any node can reach the node of interest in a few hops.\n", + "\n", + "The illustration in this notebook is organized as follows:\n", + "1. Loading the data for the example.\n", + "2. Creating the Networkx graph from the ArangoDB graph using the Networkx Adapter.\n", + "3. Analysis of the graph using Networkx.\n", + "4. Using the Networkx graph to obtain a graph embedding using _Node2vec_.\n", + "5. Using Arangopipe to store metadata about this experiment."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZbC-dNSj4c82" + }, + "source": [ + "# Loading the Data into ArangoDB" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z0BtmHbZ4c82" + }, + "source": [ + "We will use Oasis, ArangoDB's managed service offering, for this example. We will start with installing the required libraries to run this example. We will then connect to Oasis, obtain a connection and create the database. After creating the database, we will load the data into ArangoDB. The next few cells illustrate these steps." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NwlgpvAcRTiP" + }, + "source": [ + "## Install Required Libraries " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZMtTZ5Oth0N-" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!mkdir oasis\n", + "!mkdir IMDBdata\n", + "!git clone -b imdb_with_ratings https://github.com/arangodb/interactive_tutorials IMDBdata\n", + "!git clone -b oasis_connector --single-branch https://github.com/arangodb/interactive_tutorials.git oasis\n", + "!rsync -av oasis/ ./ --exclude=.git\n", + "!chmod -R 755 ./data\n", + "!chmod -R 755 ./tools\n", + "!chmod -R 755 ./oasis\n", + "\n", + "!pip3 install adbnx-adapter==0.0.0.2.5.3\n", + "!pip install arangopipe==0.0.6.9.3\n", + "!pip3 install networkx\n", + "!pip3 install matplotlib\n", + "!pip3 install pyarango\n", + "!pip3 install python-arango\n", + "!pip install node2vec\n", + "!pip install seaborn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BRAQ2O7fRTiX" + }, + "source": [ + "## Get an Oasis Connection" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kTLmib2BRTiY" + }, + "source": [ + "__Oasis__, the managed database service offering from ArangoDB, will be used for this exercise.
This eliminates the need for setting up and configuring an instance of a database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "name": "IMDB_Networkx_Adapter.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "8c4c63c2ec584e8694949cde2b878278": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "state": { - "_view_name": "HBoxView", - "_dom_classes": [], - "_model_name": "HBoxModel", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.5.0", - "box_style": "", - "layout": "IPY_MODEL_6276d89c53f1452b82fed9c2af7e0236", - "_model_module": "@jupyter-widgets/controls", - "children": [ - "IPY_MODEL_e7a24632f5fb4670b7fd8945c8e64ce7", - "IPY_MODEL_b9d9e6899f234dc381a947f25edd0b1f" - ] - } - }, - "6276d89c53f1452b82fed9c2af7e0236": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": 
"LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "e7a24632f5fb4670b7fd8945c8e64ce7": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "state": { - "_view_name": "ProgressView", - "style": "IPY_MODEL_7d125a00ff3046aab6c2dbd1b824ee5f", - "_dom_classes": [], - "description": "Computing transition probabilities: 100%", - "_model_name": "FloatProgressModel", - "bar_style": "success", - "max": 914, - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": 914, - "_view_count": null, - "_view_module_version": "1.5.0", - "orientation": "horizontal", - "min": 0, - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_a61b9cb46d724bf8bdb24de5a9592017" - } - }, - "b9d9e6899f234dc381a947f25edd0b1f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "state": { - "_view_name": "HTMLView", - "style": "IPY_MODEL_9d1e1bb9a456457a87bb0dcb440f76dc", - "_dom_classes": [], - "description": "", - "_model_name": "HTMLModel", - "placeholder": "​", - "_view_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "value": " 914/914 [00:33<00:00, 27.24it/s]", - "_view_count": null, - "_view_module_version": "1.5.0", - "description_tooltip": null, - "_model_module": "@jupyter-widgets/controls", - "layout": "IPY_MODEL_914e350efb3e4f4fbb1baf94fb4134a4" - } - }, - "7d125a00ff3046aab6c2dbd1b824ee5f": { - "model_module": "@jupyter-widgets/controls", - "model_name": 
"ProgressStyleModel", - "state": { - "_view_name": "StyleView", - "_model_name": "ProgressStyleModel", - "description_width": "initial", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "bar_color": null, - "_model_module": "@jupyter-widgets/controls" - } - }, - "a61b9cb46d724bf8bdb24de5a9592017": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - }, - "9d1e1bb9a456457a87bb0dcb440f76dc": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "state": { - "_view_name": "StyleView", - "_model_name": "DescriptionStyleModel", - "description_width": "", - "_view_module": "@jupyter-widgets/base", - "_model_module_version": "1.5.0", - "_view_count": null, - "_view_module_version": "1.2.0", - "_model_module": "@jupyter-widgets/controls" - } - }, - 
"914e350efb3e4f4fbb1baf94fb4134a4": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "state": { - "_view_name": "LayoutView", - "grid_template_rows": null, - "right": null, - "justify_content": null, - "_view_module": "@jupyter-widgets/base", - "overflow": null, - "_model_module_version": "1.2.0", - "_view_count": null, - "flex_flow": null, - "width": null, - "min_width": null, - "border": null, - "align_items": null, - "bottom": null, - "_model_module": "@jupyter-widgets/base", - "top": null, - "grid_column": null, - "overflow_y": null, - "overflow_x": null, - "grid_auto_flow": null, - "grid_area": null, - "grid_template_columns": null, - "flex": null, - "_model_name": "LayoutModel", - "justify_items": null, - "grid_row": null, - "max_height": null, - "align_content": null, - "visibility": null, - "align_self": null, - "height": null, - "min_height": null, - "padding": null, - "grid_auto_rows": null, - "grid_gap": null, - "max_width": null, - "order": null, - "_view_module_version": "1.2.0", - "grid_template_areas": null, - "object_position": null, - "object_fit": null, - "grid_auto_columns": null, - "margin": null, - "display": null, - "left": null - } - } - } - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "VLfArBaMRTiM" - }, - "source": [ - "# Analysis of IMDB data using the ArangoDB Netoworkx adapter " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Wd0_wS6TRTiO" - }, - "source": [ - "This notebook will illustrate how the Networkx adapter can be used to perform graph analytics tasks on graph data stored in ArangoDB. For this example we will use the IMDB movie review dataset. The dataset provides reviews of various movies by viewers. The details of the dataset are available [here](https://www.imdb.com/interfaces/). If we use a relational representation to analyze this problem, then we will need to know what we want to look in the data and then seek it from the data. 
For example we may be interested in the average rating for a particular genre of movies. The point here is that we need to know the questions that are important in the analysis of this problem apriori. When we use a graph representation of this data, then we can leverage the body of ideas used in network analysis (or social network analysis) to extract insights from the data. The body of ideas are the standard questions we seek answers to when analyzing any network. The answers to these questions will provide us insights about the data and the problem. In other words, we have a framework for extracting insights from the data. In the example provided in this notebook, we will use ideas from the analysis of bipartite graphs to illustrate this point. Of course, if we have a specific question that we are seeking the answer for, then that is available easily from the graph as well. Therefore, going with a graph represenation makes it easier to extract insights from the data.t\n", - "\n", - "The advantages of using a graph representation for the data extend beyond mining and learning from the data. Running adhoc queries for analytic purposes on the data is more efficient with a graph representation. With a relational representation, we can only optimize query performance for queries that are known before hand (with indexes). Adhoc queries could require multiple joins and have poor performance. Most graphs, have the so called _small world effect_. In such graphs, most nodes can be reached from most other nodes with a small number of edge traversals. An adhoc query starting at any node can reach the node of interest in a few hops.\n", - "\n", - "The illustration in this notebook is organized as follows:\n", - "1. Loading the data for the example.\n", - "2. Creating the Networkx graph from the AranogoDB graph using the Networkx Adapter.\n", - "3. Analysis of the graph using Networkx.\n", - "4. Using the Networkx graph to obtain a graph embedding using _Node2vec_.\n", - "5. 
Using Arangopipe to store metadata about this experiment." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZbC-dNSj4c82" - }, - "source": [ - "# Loading the Data into ArangoDB" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Z0BtmHbZ4c82" - }, - "source": [ - "We will use Oasis, ArangoDB's managed service offering, for this example. We will start with installing the required libraries to run this example. We will then connect to Oasis, obtain a connection and create the database.After creating the database, we will load the data into ArangoDB. The next few cells illustrate these steps." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "NwlgpvAcRTiP" - }, - "source": [ - "## Install Required Libraries " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ZMtTZ5Oth0N-" - }, - "source": [ - "%%capture\n", - "!mkdir oasis\n", - "!mkdir IMDBdata\n", - "!git clone -b imdb_with_ratings https://github.com/arangodb/interactive_tutorials IMDBdata\n", - "!git clone -b oasis_connector https://github.com/arangodb/interactive_tutorials oasis\n", - "!rsync -av oasis/ ./ --exclude=.git\n", - "!chmod -R 755 ./data\n", - "!chmod -R 755 ./tools\n", - "!chmod -R 755 ./oasis\n", - "\n", - "!pip3 install networkx\n", - "!pip3 install matplotlib\n", - "!pip3 install --index-url https://test.pypi.org/simple/ adbnx-adapter==0.0.0.2.5.3\n", - "!pip3 install pyarango\n", - "!pip3 install python-arango\n", - "!pip install node2vec\n", - "!pip install seaborn\n", - "!pip install arangopipe==0.0.6.9.3" - ], - "execution_count": 93, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BRAQ2O7fRTiX" - }, - "source": [ - "## Get a Oasis Connection" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kTLmib2BRTiY" - }, - "source": [ - "__Oasis__, the managed database service offering from 
ArangoDB, will be used for this exercise. This eliminates the need for setting up and configuring an instance of a database." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9voIoaGRS0cB", - "outputId": "441a4cd7-6eb6-4664-d97e-fc625fdb6c6f" - }, - "source": [ - "from adbnx_adapter.imdb_arangoDB_networkx_adapter import IMDBArangoDB_Networkx_Adapter\n", - "import oasis\n", - "con = oasis.getTempCredentials()\n", - "\n", - "print()\n", - "print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n", - "print(\"Username: \" + con[\"username\"])\n", - "print(\"Password: \" + con[\"password\"])\n", - "print(\"Database: \" + con[\"dbName\"])\n", - "\n", - "\n", - "ma = IMDBArangoDB_Networkx_Adapter(conn=con)" - ], - "execution_count": 133, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Reusing cached credentials.\n", - "\n", - "https://d383fa0b596a.arangodb.cloud:8529\n", - "Username: TUTavlkt1fv7jsujf5ai5qjcl\n", - "Password: TUTn5bwjhjf4rg2rzkub9hnvu\n", - "Database: TUT276o8okjtpdqehucmszbfs\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "APNXpksoRTif" - }, - "source": [ - "## Create the Collections for the Database " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "RQAoZi3AW9ru" - }, - "source": [ - "# Connect to the temp database\n", - "conn = oasis.connect(con)\n", - "db = conn[con[\"dbName\"]]" - ], - "execution_count": 134, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f0nSSiRnRTit" - }, - "source": [ - "## Import the Data " - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "sE_jzJHfASIR", - "outputId": "c5dd5bf3-447d-4e6a-f3d7-74ae707bd07b" - }, - "source": [ - "! 
./tools/arangorestore -c none --server.endpoint http+ssl://{con[\"hostname\"]}:{con[\"port\"]} --server.username {con[\"username\"]} --server.database {con[\"dbName\"]} --server.password {con[\"password\"]} --default-replication-factor 3 --input-directory \"./IMDBdata/data/imdb_with_ratings\"" - ], - "execution_count": 135, - "outputs": [ - { - "output_type": "stream", - "text": [ - "\u001b[0m2021-03-11T21:14:40Z [996] INFO [05c30] {restore} Connected to ArangoDB 'http+ssl://d383fa0b596a.arangodb.cloud:8529'\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:41Z [996] INFO [abeb4] {restore} Database name in source dump is 'TUTdit9ohpgz1ntnbetsjstwi'\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:41Z [996] INFO [9b414] {restore} # Re-creating document collection 'Movies'...\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:42Z [996] INFO [9b414] {restore} # Re-creating document collection 'Users'...\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:42Z [996] INFO [9b414] {restore} # Re-creating edge collection 'Ratings'...\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [6d69f] {restore} # Dispatched 3 job(s), using 2 worker(s)\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [94913] {restore} # Loading data into document collection 'Users', data size: 16717 byte(s)\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [94913] {restore} # Loading data into document collection 'Movies', data size: 68107 byte(s)\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [6ae09] {restore} # Successfully restored document collection 'Users'\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [94913] {restore} # Loading data into edge collection 'Ratings', data size: 1407601 byte(s)\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:43Z [996] INFO [6ae09] {restore} # Successfully restored document collection 'Movies'\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:48Z [996] INFO [75e65] {restore} # Current restore progress: restored 2 of 3 collection(s), read 9270558 byte(s) from datafiles, 
sent 3 data batch(es) of 881948 byte(s) total size, queued jobs: 0, workers: 2\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:52Z [996] INFO [69a73] {restore} # Still loading data into edge collection 'Ratings', 10660073 byte(s) restored\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:52Z [996] INFO [6ae09] {restore} # Successfully restored edge collection 'Ratings'\n", - "\u001b[0m\u001b[0m2021-03-11T21:14:52Z [996] INFO [a66e1] {restore} Processed 3 collection(s) in 12.377135 s, read 11542023 byte(s) from datafiles, sent 4 data batch(es) of 11542020 byte(s) total size\n", - "\u001b[0m" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tDmWTWYkW6VW", - "outputId": "490fb28d-d136-43f2-e067-dd540744c4bc" - }, - "source": [ - "import csv\n", - "import json\n", - "import requests\n", - "import sys\n", - "import oasis\n", - "from pyArango.connection import *\n", - "from pyArango.collection import Collection, Edges, Field\n", - "from pyArango.graph import Graph, EdgeDefinition\n", - "\n", - "class Users(Collection):\n", - " _fields = {\n", - " \"user_id\": Field(),\n", - " \"age\": Field(),\n", - " \"gender\": Field()\n", - " }\n", - " \n", - "class Movies(Collection):\n", - " _fields = {\n", - " \"movie_id\": Field(),\n", - " \"movie_title\": Field(),\n", - " \"release_data\": Field()\n", - " }\n", - "\n", - "class Ratings(Edges): \n", - " _fields = {\n", - " #user_id and movie_id (_key of movie) are encoded by _from, _to \n", - " \"rating\": Field(),\n", - " \"timestamp\": Field()\n", - " }\n", - "\n", - "class IMDBGraph(Graph):\n", - " _edgeDefinitions = [EdgeDefinition(\"Ratings\", fromCollections=[\"Users\"], toCollections=[\"Movies\"])]\n", - " _orphanedCollections = []\n", - "\n", - "iMDBGraph = db.createGraph(\"IMDBGraph\", replicationFactor=3)\n", - "\n", - "print(\"Collection/Graph Setup done.\")" - ], - "execution_count": 137, - "outputs": [ - { - "output_type": "stream", - 
"text": [ - "Collection/Graph Setup done.\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5rm-ZpNQ4c87" - }, - "source": [ - "# Use the Networkx-Adapter to create a Networkx Graph" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i0CYXcbU4c87" - }, - "source": [ - "Now that we have loaded the data into ArangoDB, we can use the __Networkx-Adapter__ to create _Networkx_ graphs from the ArangoDB graph. To do so, we'll have to provide a graph descriptor that describes the graph structure to the __Networkx-Adapter__. These steps are illustrated below. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tua4CIHYRTi0" - }, - "source": [ - "## Specify the Graph Structure " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9oKkkRxwRTi0" - }, - "source": [ - "To use the IMDB Networkx Adapter, we need to specify the structure of the graph that we want to create. This is done with a simple dictionary. " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wobwe8KqXXi2" - }, - "source": [ - "imdb_attributes = {'vertexCollections': {'Users': {},\n", - " 'Movies': {}},\n", - " 'edgeCollections': {'Ratings': {'_from', '_to', 'ratings'}}}" - ], - "execution_count": 138, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ORqpcKl_4c87" - }, - "source": [ - "## Obtain the networkx graph" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "jjJZJEgvXZ0W" - }, - "source": [ - "g = ma.create_networkx_graph(\n", - " graph_name='IMDBGraph', graph_attributes=imdb_attributes)" - ], - "execution_count": 139, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Rk-La15l4c88" - }, - "source": [ - "__Done!__, we now have a _Networkx_ graph representation that we can use for analysis!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a3dOVYck4c88" - }, - "source": [ - "# Analysis of the IMDB reviews dataset with Networkx\n", - "We just created a networkx graph for this problem. In this notebook we will use a small set of ideas from the standard body of ideas used in network analysis to extract insights from the movie review dataset. Networkx provides an implementation of these ideas. These will be discussed next.\n", - "\n", - "As mentioned earlier, one of the advantages of using a graph representation is that we can leverage the standard body of ideas used to analyze networks to extract information about this dataset. In what follows, we will call out such facts as we identify them.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Zj_ZCYrt4c88" - }, - "source": [ - "## Get the user and movie nodes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yqmOgDuN4c88" - }, - "source": [ - "user_nodes = [n for n in g.nodes() if n.startswith(\"Users\")]\n", - "movie_nodes = [n for n in g.nodes() if n.startswith(\"Movies\")]" - ], - "execution_count": 140, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZMA_ffth4c89" - }, - "source": [ - "## Structural Property Introspection: Number of Nodes and Edges" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1wjglzLW4c89", - "outputId": "8c39721f-3bfe-422a-a0ac-8275d037c456" - }, - "source": [ - "print(\"Number of Users are %d\" % (len(user_nodes)))\n", - "print(\"Number of Movies are %d\" % (len(movie_nodes)))\n", - "print(\"Number of Ratings are %d\" % (len(list(g.edges()))))" - ], - "execution_count": 141, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Number of Users are 943\n", - "Number of Movies are 1682\n", - "Number of Ratings are 65499\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"7YAkup5g4c89" - }, - "source": [ - "## Convert the graph obtained from the interface to a bi-partite graph " - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Le8HMfU-4c89" - }, - "source": [ - "import networkx as nx\n", - "B = nx.Graph()\n", - "B.add_nodes_from(user_nodes, bipartite=0)\n", - "B.add_nodes_from(movie_nodes, bipartite=1)\n", - "B.add_edges_from(list(g.edges()))" - ], - "execution_count": 142, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZaBJqyZy4c89" - }, - "source": [ - "### Identified fact:\n", - "_The graph has two kinds of edges. It has 943 users and and 1682 Movies. A user may watch multiple movies. 65499 movie ratings are available. This information is obtained by simply identifying the structure of the graph._" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ybp1eslr4c89", - "outputId": "9e307259-52a8-4172-8725-6d7cde71ea80" - }, - "source": [ - "from networkx.algorithms import bipartite\n", - "bipartite.is_bipartite(B)" - ], - "execution_count": 143, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "True" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 143 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RcV_-8Dj4c89" - }, - "source": [ - "## Is the graph connected?\n", - "If the graph is connected then there is a path between any two nodes in the graph. If not, then some nodes are not connected. In the context of this application, a disconnected graph implies that there exist users in the graph who may not have rated a movie in common. If it is true that any given pair of users have rated at least one movie in common, then there is a path connecting these users through the movie they have rated in common." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KIYLmAtj4c8-" - }, - "source": [ - "## Identified fact:\n", - "_In the user community, if we pick any two users, then it is possible that they may not have rated a movie in common._" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Dr749ocn4c8-", - "outputId": "e4e91970-7275-4e6c-d1b6-589b7cb79039" - }, - "source": [ - "nx.is_connected(B)" - ], - "execution_count": 144, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "False" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 144 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1nNjYdTp4c8-" - }, - "source": [ - "users = [n for n in B.nodes if B.nodes[n]['bipartite'] == 0]\n", - "movies = [n for n in B.nodes if B.nodes[n]['bipartite'] == 1]" - ], - "execution_count": 145, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pWP1Tm0Z4c8-" - }, - "source": [ - "## How many movies does a User rate?\n", - "This is a distribution. A review of the plot below will show the familiar long tailed distribution. Most viewers rate less than say 100 movies. There are some serious reviewers out there though. These are on the tail of the distribution." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "nW4ztdKR4c8-" - }, - "source": [ - "degu, degm = bipartite.degrees(B, movies)" - ], - "execution_count": 146, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "xEkEriFa4c8-" - }, - "source": [ - "du = [v for k,v in degu]" - ], - "execution_count": 147, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 355 - }, - "id": "iMlIdvIp4c8-", - "outputId": "40bdc0d7-afe6-4d22-b26c-a9d7b305c696" - }, - "source": [ - "import matplotlib.pyplot as plt\n", - "plt.rcParams['figure.figsize'] = [10, 5]\n", - "ax = plt.hist(du)\n", - "plt.title(\"Histogram of the Number of Movies Viewed\")\n", - "plt.xlabel(\"Movies Viewed\")\n", - "plt.grid(True)" - ], - "execution_count": 148, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ePYKt-a4c8-" - }, - "source": [ - "### Identified fact:\n", - "_Inspecting the degree distribution of vertices is an activity we commonly perform to understand some basic characteristics of the network. In this application, this activity shows that most user's rate less than a 100 movies. The graph above is an example of \"long tailed distribution\". This is commonly seen in social networks._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TLGsHDx24c8-" - }, - "source": [ - "## How many users rate a movie?\n", - "This is also a distribution that also has the long tailed behavior. Most mvoies are rated by less than 100 users. There are some movies that are rated by many viewers though." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 355 - }, - "id": "zCdf0HX34c8_", - "outputId": "0b0a31eb-bac7-400f-da72-b72020f0bcaa" - }, - "source": [ - "dm = [v for k,v in degm]\n", - "plt.rcParams['figure.figsize'] = [10, 5]\n", - "ax = plt.hist(dm)\n", - "plt.title(\"Histogram of the Number of Viewers Rating a Movie\")\n", - "plt.xlabel(\"Number of Viewers\")\n", - "plt.grid(True)" - ], - "execution_count": 149, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "43HtTUpU4c8_" - }, - "source": [ - "### Identified fact:\n", - "_We can repeat the exploration of degree distribution with the movie nodes. The behavior observed with the movie nodes is similar to what is observed with the user nodes. We observe the same \"long tailed\" distribution and most movies are rated by less than a 100 users._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mNf2j7U14c8_" - }, - "source": [ - "## Centrality Measures\n", - "\n", - "A centrality measure identifies influential nodes in a network. How do we formalize the observations about movie importance and the user's rating behavior in the given network? The notion of centrality measure is useful in this regard. Many measures of node centrality are used, see [Chapter 3 of the text book by Zafarani et.al](http://dmml.asu.edu/smm/) for details. We will use degree centraility in this example (see the [documentation](https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.centrality.degree_centrality.html) for details). In the illustrations below, we will capture the top 10 users and movies in terms of importance as determined by this measure. 
" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "BsHe6G_N4c8_" - }, - "source": [ - "dc= bipartite.degree_centrality(B, users)\n", - "sdcu = {}\n", - "sdcm = {}\n", - "for k, v in sorted(dc.items(),reverse=True, key=lambda item: item[1]):\n", - " if k.startswith(\"Users\"):\n", - " sdcu[k] = v\n", - " else:\n", - " sdcm[k] = v\n", - "\n", - "del dc" - ], - "execution_count": 150, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Uf4MZBHe4c8_" - }, - "source": [ - "## List the top 10 users in terms of degree centrality\n", - "These guys rate a lot of movies" - ] - }, - { - "cell_type": "code", - "metadata": { - "scrolled": true, - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "N64DiO3H4c8_", - "outputId": "3aba9db1-4cb3-4350-fe55-993786eb8f3d" - }, - "source": [ - "list(sdcu.keys())[:10]" - ], - "execution_count": 151, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['Users/405',\n", - " 'Users/13',\n", - " 'Users/276',\n", - " 'Users/450',\n", - " 'Users/234',\n", - " 'Users/303',\n", - " 'Users/416',\n", - " 'Users/655',\n", - " 'Users/181',\n", - " 'Users/393']" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 151 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "caKNxjrP4c8_" - }, - "source": [ - "## List the top 10 movies in terms of degree centrality\n", - "These movies are rated by a lot of people" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Gswkyc5f4c8_", - "outputId": "af141f85-b4b4-4b24-c098-2bdfb88da4f0" - }, - "source": [ - "list(sdcm.keys())[:10]" - ], - "execution_count": 152, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['Movies/50',\n", - " 'Movies/258',\n", - " 'Movies/100',\n", - " 'Movies/288',\n", - " 'Movies/181',\n", - " 'Movies/286',\n", - " 'Movies/294',\n", - " 'Movies/1',\n", - " 'Movies/56',\n", 
- " 'Movies/98']" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 152 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CeHGqYwX4c8_" - }, - "source": [ - "## How does this importance measure vary over the user community?\n", - "We can get a sense of this by viewing the distribution of this measure." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "0pOqgjAp4c9A", - "outputId": "a3abe1e1-e275-46fa-85a0-a1b64d0ec7a6" - }, - "source": [ - "import seaborn as sns\n", - "import numpy as np\n", - "npvals = np.fromiter(sdcu.values(), dtype=float)\n", - "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", - "ax = sns.distplot(npvals, hist = False)\n", - "plt.title(\"Distribution of User Degree Centrality\")\n", - "plt.xlabel(\"User Degree Centraility\")\n" - ], - "execution_count": 153, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n", - " warnings.warn(msg, FutureWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Text(0.5, 0, 'User Degree Centraility')" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 153 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kbzu3I_q4c9A" - }, - "source": [ - "## How does this importance measure vary over the movie community?" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "LQ28qv9y4c9A", - "outputId": "e0b35ddc-0ce8-4ef6-ff11-b5ec7d2115db" - }, - "source": [ - "import seaborn as sns\n", - "import numpy as np\n", - "npvals = np.fromiter(sdcm.values(), dtype=float)\n", - "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", - "ax = sns.distplot(npvals, hist = False)\n", - "plt.title(\"Distribution of Movie Degree Centrality\")\n", - "plt.xlabel(\"Movie Degree Centraility\")\n" - ], - "execution_count": 154, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n", - " warnings.warn(msg, FutureWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Text(0.5, 0, 'Movie Degree Centraility')" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 154 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "z1GwPZIg4c9A" - }, - "source": [ - "### Identified fact:\n", - "_We have a measure to capture the importance of a movie and a user on the basis of the rating activity performed on them. We have identified important users and movies in the network on the basis of this measure. We have also have information about how this importance measure varies over the users and movies in the data._" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "x1vtgYbAXpec" - }, - "source": [ - "m4v = [t[0] for t in g.in_edges('Movies/4')]" - ], - "execution_count": 155, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sO21vdXv4c9A" - }, - "source": [ - "## Similarity of nodes\n", - "Just like measures exist to quantify the importance of nodes in a network, there exist measures to quantify the similarity of nodes in a network. We will pick a random node in the network and characterize the similarity of nodes connected to it." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y4W1MtL-RTjK" - }, - "source": [ - "## How similar are viewers of the movie 'Get Shorty'?\n", - "In this example, we will use the __Jaccard__ similarity as a measure of node similarity. We first get all pairs of users who have seen the movie and then compute the __Jaccard__ similarity between them. The details are shown below. Networkx provides an API for this purpose that we will use." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "z4WoMVchXtVq" - }, - "source": [ - "from itertools import combinations\n", - "m4vucmb = list(combinations(m4v, 2))" - ], - "execution_count": 156, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "3Z4VBUvkXyHl" - }, - "source": [ - "import networkx as nx\n", - "gp = g.to_undirected()\n", - "jcp = nx.jaccard_coefficient(gp, m4vucmb)" - ], - "execution_count": 157, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "f2i7Puh9X14N" - }, - "source": [ - "jcpv = []\n", - "for u, v, p in jcp:\n", - " jcpv.append(p)\n", - " #print('(%s, %s) -> %.8f' % (u, v, p))" - ], - "execution_count": 158, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-L1O0rhe4c9B" - }, - "source": [ - "## What does the distribution of User Similarity look like?" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "ANP7CmNj4c9B", - "outputId": "0d78641c-b03c-48ff-e3a8-76a14d6719cf" - }, - "source": [ - "import seaborn as sns\n", - "import numpy as np\n", - "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", - "ax = sns.distplot(jcpv, hist = False)\n", - "plt.title(\"Distribution of Jaccard Similarity between Raters of Get Shorty\")\n", - "plt.xlabel(\"Jaccard Similarity\")" - ], - "execution_count": 159, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. 
Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n", - " warnings.warn(msg, FutureWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Text(0.5, 0, 'Jaccard Similarity')" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 159 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dRNe85k54c9B" - }, - "source": [ - "### Identified fact:\n", - "_Picking a random node in the graph, we characterized the similarity of nodes connected to it. In this dataset, we can use this idea to get a sense of how similar user's rating a movie tend to be. A review of the above figure shows that a range of similarities are observed._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1SJKyWDe4c9B" - }, - "source": [ - "## Bipartite Clustering\n", - "The next idea that we will investigate on the graph comes from [Latapy et.al](https://arxiv.org/abs/cond-mat/0611631). In this work, the authors extend the notion of clustering coefficients associated with a node to the case of bi-partite graphs. Since our graph is a bipartite graph, we will use this metric. See section VIII of the paper for a detailed explanation of the idea. Briefly, the clustering coefficient for a pair of nodes $u, v$, is defined as follows:\n", - "$$\n", - "\\begin{equation*}\n", - "cc(u,v) = \\frac{N(u)\\cap N(v)}{N(u) \\cup N(v)},\n", - "\\end{equation*}\n", - "$$\n", - "where:\n", - "\n", - "* N(u) is the neighborhood of node $u$. These are the nodes connected to $u$. For example, the movies user $u$ rates.\n", - "* N(v) is the neighborhood of node $v$.\n", - "Reviewing the above equation, it is evident that the clustering coefficient captures the shared neighborhood for a pair of nodes. Nodes associated with high clustering coefficient have many common neighbors. In the context of this problem, a pair of users with a high clustering coefficient suggests that these users have rated many movies in common. Similarly, a pair of movies with a high clustering coefficient suggests that these movies there are many common raters(users) for these movies. Networkx provides an implementation of this idea. 
The details of computing the clustering coefficients for the users and movies are shown below.\n", - "To obtain the clustering coefficient for a node $u$, we simply average the clustering coefficient of $u$ with other nodes that it shares common neighbors with. It is defined as follows:\n", - "\n", - "$$\n", - "\\begin{equation*}\n", - "cc(u) = \\frac{\\sum_{v \\in N(u)} cc(u,v)}{\\left| N(N(u) \\right|},\n", - "\\end{equation*}\n", - "$$\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "u7o7oLkC4c9B" - }, - "source": [ - "cr = bipartite.clustering(B)\n", - "cu = {}\n", - "cm = {}\n", - "for k, v in sorted(cr.items(),reverse=True, key=lambda item: item[1]):\n", - " if k.startswith(\"Users\"):\n", - " cu[k] = v\n", - " else:\n", - " cm[k] = v\n", - "\n", - "del cr" - ], - "execution_count": 160, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ATmq_-aK4c9B" - }, - "source": [ - "## List the top 10 users in terms of clustering tendencies \n", - "These users share a high degree of common movie rating activity with other users. 
If rating is used as a proxy for the act of liking or disliking a movie, then user's with high clustering values share rating activity with " - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "HX9colNZ4c9B", - "outputId": "660accba-5da0-47ba-b479-17667dc94eb9" - }, - "source": [ - "list(cu.keys())[:10]" - ], - "execution_count": 161, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['Users/72',\n", - " 'Users/297',\n", - " 'Users/64',\n", - " 'Users/178',\n", - " 'Users/533',\n", - " 'Users/493',\n", - " 'Users/347',\n", - " 'Users/301',\n", - " 'Users/198',\n", - " 'Users/249']" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 161 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gE83U_Am4c9B" - }, - "source": [ - "## List the top 10 movies in terms of clustering tendencies \n", - "These movies have a high degree of common users rating them." - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Av0ToBrp4c9C", - "outputId": "c97d33bd-1940-40e6-da2e-f2386753712b" - }, - "source": [ - "list(cm.keys())[:10]" - ], - "execution_count": 162, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['Movies/1663',\n", - " 'Movies/1643',\n", - " 'Movies/1669',\n", - " 'Movies/1652',\n", - " 'Movies/1373',\n", - " 'Movies/1364',\n", - " 'Movies/1363',\n", - " 'Movies/1354',\n", - " 'Movies/1349',\n", - " 'Movies/1348']" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 162 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BgGI3bv04c9C" - }, - "source": [ - "## How does user clustering tendency vary over the community?" 
- ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "JNGtK0z84c9C", - "outputId": "46a87af3-a70f-4613-fc00-a18290bde79e" - }, - "source": [ - "import seaborn as sns\n", - "import numpy as np\n", - "npvals = np.fromiter(cu.values(), dtype=float)\n", - "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", - "ax = sns.distplot(npvals, hist = False)\n", - "plt.title(\"Distribution of User Clustering Coefficients\")\n", - "plt.xlabel(\"User Clustering Coefficients\")\n" - ], - "execution_count": 163, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n", - " warnings.warn(msg, FutureWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Text(0.5, 0, 'User Clustering Coefficients')" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 163 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EjfGafPm4c9C" - }, - "source": [ - "### Note:\n", - "Note that the distribution of user clustering coefficients has two humps (bi-modal). This suggests that there are two user communities. This is borne out when we visualize a sample of the user community using a _tSNE_ plot later in this notebook." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SPxiPAjm4c9C" - }, - "source": [ - "## How does movie clustering tendency vary over the community?" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "enjjfIib4c9C", - "outputId": "bf997c79-d536-42e9-dd90-1263cfea1a0b" - }, - "source": [ - "import seaborn as sns\n", - "import numpy as np\n", - "npvals = np.fromiter(cm.values(), dtype=float)\n", - "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", - "ax = sns.distplot(npvals, hist = False)\n", - "plt.title(\"Distribution of Movie Clustering Coefficients\")\n", - "plt.xlabel(\"Movie Clustering Coefficients\")" - ], - "execution_count": 164, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).\n", - " warnings.warn(msg, FutureWarning)\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Text(0.5, 0, 'Movie Clustering Coefficients')" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 164 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0ICweEaZ4c9C" - }, - "source": [ - "### Note:\n", - "Note that the distribution of user clustering coefficients has three humps. This suggests that there are three movie clusters. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b26SbNVt4c9C" - }, - "source": [ - "### Identified fact:\n", - "_We have identified users and movies that tend to have have a lot of common rating activity. We have identified that users and movies have clustering tendencies._" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hJmuEFFx4c9C" - }, - "source": [ - "# Using Node2vec to obtain a graph embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Nc-VtRGO4c9D" - }, - "source": [ - "Machine learning techniques can be applied on graphs to determine vector representations of graph elements such as nodes and edges. Such a representation is called an _embedding_. We can use the Networkx representation of the graph associated with the 10 most clustered users and determine its embedding. We can use a technique called [_t Stochastic Neighbor Embedding_](https://www.youtube.com/watch?v=RJVL80Gg3lA&list=UUtXKDgv1AVoG88PLl8nGXmw) to plot a two dimensional representation of this subgraph. The details of doing this is provided below. Earlier we noted that the user clustering tendencies were bi-modal. A review of the embedding of the sub-graph associated with the 10 most clustered users shows that there are two clusters in the data." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kHWs_A1X4c9D" - }, - "source": [ - "## Extract the sub-graph associated with the 10 most clustered users" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "IoGJKmJm4c9D" - }, - "source": [ - "t10cu = list(cu.keys())[:10]\n", - "proj_user = nx.bipartite.projected_graph(B, t10cu)" - ], - "execution_count": 165, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qyZwFHnL4c9D", - "outputId": "2d8eaf1e-b445-430e-e675-6eb261884b11" - }, - "source": [ - "len(list(proj_user.edges()))" - ], - "execution_count": 166, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "8796" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 166 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "tLgoFfvf4c9D", - "outputId": "68dd10d8-d483-4ed8-af2c-cca5afdb845c" - }, - "source": [ - "len(proj_user.nodes())" - ], - "execution_count": 167, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "914" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 167 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Jv5XKBLfRTjd" - }, - "source": [ - "## Embed the sub-graph using Node2vec " - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 66, - "referenced_widgets": [ - "8c4c63c2ec584e8694949cde2b878278", - "6276d89c53f1452b82fed9c2af7e0236", - "e7a24632f5fb4670b7fd8945c8e64ce7", - "b9d9e6899f234dc381a947f25edd0b1f", - "7d125a00ff3046aab6c2dbd1b824ee5f", - "a61b9cb46d724bf8bdb24de5a9592017", - "9d1e1bb9a456457a87bb0dcb440f76dc", - "914e350efb3e4f4fbb1baf94fb4134a4" - ] - }, - "id": "e-fD-eNhX9Lh", - "outputId": "a2ffbdc5-860e-4e04-8e92-59ea7d3ff1e6" - }, - "source": [ - "from node2vec import Node2Vec\n", - 
"node2vec = Node2Vec(proj_user, dimensions=64, walk_length=10, num_walks=100, workers=4)" - ], - "execution_count": 168, - "outputs": [ - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "8c4c63c2ec584e8694949cde2b878278", - "version_minor": 0, - "version_major": 2 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Computing transition probabilities', max=914.0, style=Pro…" - ] - }, - "metadata": { - "tags": [] - } - }, - { - "output_type": "stream", - "text": [ - "\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6hu6Z0VPYDFK" - }, - "source": [ - "model = node2vec.fit(window=10, min_count=1, batch_words=4)" - ], - "execution_count": 169, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 35 - }, - "id": "kUN1zLMG4c9D", - "outputId": "fd0aaad1-155c-45a2-d6b8-6f48c14920d0" - }, - "source": [ - "t10cu[2]" - ], - "execution_count": 170, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'Users/64'" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 170 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0NMFl0dD4c9D" - }, - "source": [ - "## Apply the model\n", - "We can ask questions like \"Who is most similar to (a particular user) Users/64?\"" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "41YVYl3jYHia", - "outputId": "02c9036c-cf25-4018-c742-ca595aa91b83" - }, - "source": [ - "model.wv.most_similar(t10cu[2])" - ], - "execution_count": 171, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[('Users/249', 0.8666249513626099),\n", - " ('Users/178', 0.8422333598136902),\n", - " ('Users/493', 
0.832413911819458),\n", - " ('Users/347', 0.8107874393463135),\n", - " ('Users/301', 0.8077564835548401),\n", - " ('Users/297', 0.7933509349822998),\n", - " ('Users/533', 0.7722172737121582),\n", - " ('Users/896', 0.7490148544311523),\n", - " ('Users/598', 0.7475563287734985),\n", - " ('Users/483', 0.734812319278717)]" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 171 - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SEgDZqYb4c9E" - }, - "source": [ - "## Embed the graph with tSNE" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Ojg_VZVP4c9E" - }, - "source": [ - "t10cu_emb = { n: list(model.wv.get_vector(n)) for n in proj_user.nodes()}" - ], - "execution_count": 172, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 604 - }, - "id": "joYmsa3X4c9E", - "outputId": "a0e839c9-914d-4e0b-8499-37cb669f1e04" - }, - "source": [ - "import numpy as np\n", - "from matplotlib import cm\n", - "from sklearn.manifold import TSNE\n", - "fig = plt.figure(figsize=(11.7,8.27))\n", - "plt.grid(True)\n", - "plt.xlabel(\"Comp 1\")\n", - "plt.ylabel(\"Comp 2\")\n", - "plt.title(\"tSNE plot of subgraph of top 10 most clustered users\")\n", - "t10cu_embedded = TSNE(n_components=2).fit_transform(list(t10cu_emb.values()))\n", - "sns.scatterplot(t10cu_embedded[:,0], t10cu_embedded[:,1])" - ], - "execution_count": 173, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variables as keyword args: x, y. 
From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.\n", - " FutureWarning\n" - ], - "name": "stderr" - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 173 - }, - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "tags": [] - } - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "97aU0nWw4c9E" - }, - "source": [ - "__Using the Networkx adapter we have been able to perform a range of analysis tasks using the Networkx API. We were also able use the Networkx graph to obtain an embedding of a graph using Node2vec.__" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dA_np4UI4c9E" - }, - "source": [ - "## Store Embeddings in ArangoDB" - ] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "uzMUv9Ol4c9E", - "outputId": "6a171f56-f70c-4c04-dad4-85ec2d7fcd30" - }, - "source": [ - "%time\n", - "collection = db[\"Users\"]\n", - "with BulkOperation(collection, batchSize=500) as col:\n", - " #user_col = db.collections[\"Users\"]\n", - " for u, e in t10cu_emb.items():\n", - " the_key = u.split('/')[1]\n", - " the_user = collection[the_key]\n", - " the_user[\"n2v_emb\"] = e\n", - " the_user.patch()" - ], - "execution_count": 174, - "outputs": [ - { - "output_type": "stream", - "text": [ - "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\n", - "Wall time: 8.11 µs\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O81cxxoM4c9E" - }, - "source": [ - "# Store metadata about these experiments using Arangopipe" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "7VxWRZOy4c9E" - }, - "source": [ - "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", - "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", - "from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig\n", - "from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam\n", - "mdb_config = ArangoPipeConfig()\n", - "msc = ManagedServiceConnParam()\n", - "conn_params = { msc.DB_SERVICE_HOST : \"arangoml.arangodb.cloud\", \\\n", - " msc.DB_SERVICE_END_POINT : 
\"createDB\",\\\n", - " msc.DB_SERVICE_NAME : \"createDB\",\\\n", - " msc.DB_SERVICE_PORT : 8529,\\\n", - " msc.DB_CONN_PROTOCOL : 'https'}\n", - " \n", - "mdb_config = mdb_config.create_connection_config(conn_params)\n", - "admin = ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n", - "ap_config = admin.get_config()\n", - "ap = ArangoPipe(config = ap_config)\n", - "proj_info = {\"name\": \"IMDB_Movie_Reviews\"}\n", - "proj_reg = admin.register_project(proj_info)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "LH3eVkTHM6x6", - "outputId": "80cae2cd-d390-4a7d-b734-5b1b7893367a" - }, - "source": [ - "mdb_config.get_cfg()" - ], - "execution_count": 183, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'arangodb': {'DB_end_point': 'createDB',\n", - " 'DB_service_host': 'arangoml.arangodb.cloud',\n", - " 'DB_service_name': 'createDB',\n", - " 'DB_service_port': 8529,\n", - " 'arangodb_replication_factor': None,\n", - " 'conn_protocol': 'https',\n", - " 'dbName': 'MLfihnc2v7dndo1zt7hvs27p',\n", - " 'password': 'MLi4rxshzlq3c00nwx63k60jaq',\n", - " 'username': 'MLu8bpng85889qv3msheqikc'},\n", - " 'mlgraph': {'graphname': 'enterprise_ml_graph'}}" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 183 - } - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "2vQCXknQ4c9F" - }, - "source": [ - "import io\n", - "import requests\n", - "url = ('https://raw.githubusercontent.com/arangoml/networkx-adapter/master/examples/IMDB_Networkx_Adapter.ipynb')\n", - "nbjson = requests.get(url).text\n", - "model_info = {\"name\": \"Exploratory Data Analysis IMDB\", \"task\": \"multiple\", 'notebook': nbjson}\n", - "model_reg = ap.register_model(model_info, project = \"IMDB_Movie_Reviews\")" - ], - "execution_count": 178, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"motmuZKd4c9F" - }, - "source": [ - "# Summary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bkhLXelI4c9F" - }, - "source": [ - "* The Networkx-Adapter makes it easy to obtain Networkx graphs from ArangoDB graphs.\n", - "* We have demonstrated that using a graph representation helped us identify facts about the data. These facts were identified by leveraging standard ideas from Network Theory.\n", - "* We have leveraged the Networkx graph to obtain a _Node2vec_ embedding of the graph associated with our data. For an example of how the Networkx-Adapter can be used with a _deep learning_ library, please view the [ITSM ArangoDB Adapter](https://github.com/arangoml/networkx-adapter/blob/master/examples/ITSM_ArangoDB_Adapter.ipynb). " - ] - } - ] -} \ No newline at end of file + "base_uri": "https://localhost:8080/" + }, + "id": "9voIoaGRS0cB", + "outputId": "441a4cd7-6eb6-4664-d97e-fc625fdb6c6f" + }, + "outputs": [], + "source": [ + "from adbnx_adapter.imdb_arangoDB_networkx_adapter import IMDBArangoDB_Networkx_Adapter\n", + "import oasis\n", + "con = oasis.getTempCredentials()\n", + "\n", + "print()\n", + "print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n", + "print(\"Username: \" + con[\"username\"])\n", + "print(\"Password: \" + con[\"password\"])\n", + "print(\"Database: \" + con[\"dbName\"])\n", + "\n", + "\n", + "ma = IMDBArangoDB_Networkx_Adapter(conn=con)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "APNXpksoRTif" + }, + "source": [ + "## Create the Collections for the Database " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RQAoZi3AW9ru" + }, + "outputs": [], + "source": [ + "# Connect to the temp database\n", + "conn = oasis.connect(con)\n", + "db = conn[con[\"dbName\"]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f0nSSiRnRTit" + }, + "source": [ + "## Import the Data " + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sE_jzJHfASIR", + "outputId": "c5dd5bf3-447d-4e6a-f3d7-74ae707bd07b" + }, + "outputs": [], + "source": [ + "! ./tools/arangorestore -c none --server.endpoint http+ssl://{con[\"hostname\"]}:{con[\"port\"]} --server.username {con[\"username\"]} --server.database {con[\"dbName\"]} --server.password {con[\"password\"]} --default-replication-factor 3 --input-directory \"./IMDBdata/data/imdb_with_ratings\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tDmWTWYkW6VW", + "outputId": "490fb28d-d136-43f2-e067-dd540744c4bc" + }, + "outputs": [], + "source": [ + "import csv\n", + "import json\n", + "import requests\n", + "import sys\n", + "import oasis\n", + "from pyArango.connection import *\n", + "from pyArango.collection import Collection, Edges, Field\n", + "from pyArango.graph import Graph, EdgeDefinition\n", + "\n", + "class Users(Collection):\n", + " _fields = {\n", + " \"user_id\": Field(),\n", + " \"age\": Field(),\n", + " \"gender\": Field()\n", + " }\n", + " \n", + "class Movies(Collection):\n", + " _fields = {\n", + " \"movie_id\": Field(),\n", + " \"movie_title\": Field(),\n", + " \"release_data\": Field()\n", + " }\n", + "\n", + "class Ratings(Edges): \n", + " _fields = {\n", + " #user_id and movie_id (_key of movie) are encoded by _from, _to \n", + " \"rating\": Field(),\n", + " \"timestamp\": Field()\n", + " }\n", + "\n", + "class IMDBGraph(Graph):\n", + " _edgeDefinitions = [EdgeDefinition(\"Ratings\", fromCollections=[\"Users\"], toCollections=[\"Movies\"])]\n", + " _orphanedCollections = []\n", + "\n", + "iMDBGraph = db.createGraph(\"IMDBGraph\", replicationFactor=3)\n", + "\n", + "print(\"Collection/Graph Setup done.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5rm-ZpNQ4c87" + }, + "source": [ + "# Use the Networkx-Adapter to create a Networkx Graph" + ] + 
}, + { + "cell_type": "markdown", + "metadata": { + "id": "i0CYXcbU4c87" + }, + "source": [ + "Now that we have loaded the data into ArangoDB, we can use the __Networkx-Adapter__ to create _Networkx_ graphs from the ArangoDB graph. To do so, we'll have to provide a graph descriptor that describes the graph structure to the __Networkx-Adapter__. These steps are illustrated below. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tua4CIHYRTi0" + }, + "source": [ + "## Specify the Graph Structure " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9oKkkRxwRTi0" + }, + "source": [ + "To use the IMDB Networkx Adapter, we need to specify the structure of the graph that we want to create. This is done with a simple dictionary. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wobwe8KqXXi2" + }, + "outputs": [], + "source": [ + "imdb_attributes = {'vertexCollections': {'Users': {},\n", + " 'Movies': {}},\n", + " 'edgeCollections': {'Ratings': {'_from', '_to', 'ratings'}}}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ORqpcKl_4c87" + }, + "source": [ + "## Obtain the networkx graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jjJZJEgvXZ0W" + }, + "outputs": [], + "source": [ + "g = ma.create_networkx_graph(\n", + " graph_name='IMDBGraph', graph_attributes=imdb_attributes)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Rk-La15l4c88" + }, + "source": [ + "__Done!__, we now have a _Networkx_ graph representation that we can use for analysis!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a3dOVYck4c88" + }, + "source": [ + "# Analysis of the IMDB reviews dataset with Networkx\n", + "We just created a networkx graph for this problem. In this notebook we will use a small set of ideas from the standard body of ideas used in network analysis to extract insights from the movie review dataset. 
Networkx provides an implementation of these ideas. These will be discussed next.\n", + "\n", + "As mentioned earlier, one of the advantages of using a graph representation is that we can leverage the standard body of ideas used to analyze networks to extract information about this dataset. In what follows, we will call out such facts as we identify them.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zj_ZCYrt4c88" + }, + "source": [ + "## Get the user and movie nodes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yqmOgDuN4c88" + }, + "outputs": [], + "source": [ + "user_nodes = [n for n in g.nodes() if n.startswith(\"Users\")]\n", + "movie_nodes = [n for n in g.nodes() if n.startswith(\"Movies\")]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZMA_ffth4c89" + }, + "source": [ + "## Structural Property Introspection: Number of Nodes and Edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1wjglzLW4c89", + "outputId": "8c39721f-3bfe-422a-a0ac-8275d037c456" + }, + "outputs": [], + "source": [ + "print(\"Number of Users are %d\" % (len(user_nodes)))\n", + "print(\"Number of Movies are %d\" % (len(movie_nodes)))\n", + "print(\"Number of Ratings are %d\" % (len(list(g.edges()))))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7YAkup5g4c89" + }, + "source": [ + "## Convert the graph obtained from the interface to a bi-partite graph " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Le8HMfU-4c89" + }, + "outputs": [], + "source": [ + "import networkx as nx\n", + "B = nx.Graph()\n", + "B.add_nodes_from(user_nodes, bipartite=0)\n", + "B.add_nodes_from(movie_nodes, bipartite=1)\n", + "B.add_edges_from(list(g.edges()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZaBJqyZy4c89" + }, + "source": [ + "### 
Identified fact:\n", + "_The graph has two kinds of nodes. It has 943 users and 1682 Movies. A user may watch multiple movies. 65499 movie ratings are available. This information is obtained by simply identifying the structure of the graph._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ybp1eslr4c89", + "outputId": "9e307259-52a8-4172-8725-6d7cde71ea80" + }, + "outputs": [], + "source": [ + "from networkx.algorithms import bipartite\n", + "bipartite.is_bipartite(B)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RcV_-8Dj4c89" + }, + "source": [ + "## Is the graph connected?\n", + "If the graph is connected then there is a path between any two nodes in the graph. If not, then some nodes are not connected. In the context of this application, a disconnected graph implies that there exist users in the graph who may not have rated a movie in common. If it is true that any given pair of users have rated at least one movie in common, then there is a path connecting these users through the movie they have rated in common."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KIYLmAtj4c8-" + }, + "source": [ + "## Identified fact:\n", + "_In the user community, if we pick any two users, then it is possible that they may not have rated a movie in common._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Dr749ocn4c8-", + "outputId": "e4e91970-7275-4e6c-d1b6-589b7cb79039" + }, + "outputs": [], + "source": [ + "nx.is_connected(B)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1nNjYdTp4c8-" + }, + "outputs": [], + "source": [ + "users = [n for n in B.nodes if B.nodes[n]['bipartite'] == 0]\n", + "movies = [n for n in B.nodes if B.nodes[n]['bipartite'] == 1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pWP1Tm0Z4c8-" + }, + "source": [ + "## How many movies does a User rate?\n", + "This is a distribution. A review of the plot below will show the familiar long tailed distribution. Most viewers rate less than say 100 movies. There are some serious reviewers out there though. These are on the tail of the distribution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nW4ztdKR4c8-" + }, + "outputs": [], + "source": [ + "degu, degm = bipartite.degrees(B, movies)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xEkEriFa4c8-" + }, + "outputs": [], + "source": [ + "du = [v for k,v in degu]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 355 + }, + "id": "iMlIdvIp4c8-", + "outputId": "40bdc0d7-afe6-4d22-b26c-a9d7b305c696" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.rcParams['figure.figsize'] = [10, 5]\n", + "ax = plt.hist(du)\n", + "plt.title(\"Histogram of the Number of Movies Viewed\")\n", + "plt.xlabel(\"Movies Viewed\")\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ePYKt-a4c8-" + }, + "source": [ + "### Identified fact:\n", + "_Inspecting the degree distribution of vertices is an activity we commonly perform to understand some basic characteristics of the network. In this application, this activity shows that most users rate fewer than 100 movies. The graph above is an example of a \"long tailed distribution\". This is commonly seen in social networks._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TLGsHDx24c8-" + }, + "source": [ + "## How many users rate a movie?\n", + "This distribution also has the long tailed behavior. Most movies are rated by fewer than 100 users. There are some movies that are rated by many viewers though."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 355 + }, + "id": "zCdf0HX34c8_", + "outputId": "0b0a31eb-bac7-400f-da72-b72020f0bcaa" + }, + "outputs": [], + "source": [ + "dm = [v for k,v in degm]\n", + "plt.rcParams['figure.figsize'] = [10, 5]\n", + "ax = plt.hist(dm)\n", + "plt.title(\"Histogram of the Number of Viewers Rating a Movie\")\n", + "plt.xlabel(\"Number of Viewers\")\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "43HtTUpU4c8_" + }, + "source": [ + "### Identified fact:\n", + "_We can repeat the exploration of degree distribution with the movie nodes. The behavior observed with the movie nodes is similar to what is observed with the user nodes. We observe the same \"long tailed\" distribution and most movies are rated by fewer than 100 users._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mNf2j7U14c8_" + }, + "source": [ + "## Centrality Measures\n", + "\n", + "A centrality measure identifies influential nodes in a network. How do we formalize the observations about movie importance and the user's rating behavior in the given network? The notion of centrality measure is useful in this regard. Many measures of node centrality are used, see [Chapter 3 of the textbook by Zafarani et al.](http://dmml.asu.edu/smm/) for details. We will use degree centrality in this example (see the [documentation](https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.centrality.degree_centrality.html) for details). In the illustrations below, we will capture the top 10 users and movies in terms of importance as determined by this measure. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BsHe6G_N4c8_" + }, + "outputs": [], + "source": [ + "dc= bipartite.degree_centrality(B, users)\n", + "sdcu = {}\n", + "sdcm = {}\n", + "for k, v in sorted(dc.items(),reverse=True, key=lambda item: item[1]):\n", + " if k.startswith(\"Users\"):\n", + " sdcu[k] = v\n", + " else:\n", + " sdcm[k] = v\n", + "\n", + "del dc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Uf4MZBHe4c8_" + }, + "source": [ + "## List the top 10 users in terms of degree centrality\n", + "These guys rate a lot of movies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N64DiO3H4c8_", + "outputId": "3aba9db1-4cb3-4350-fe55-993786eb8f3d" + }, + "outputs": [], + "source": [ + "list(sdcu.keys())[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "caKNxjrP4c8_" + }, + "source": [ + "## List the top 10 movies in terms of degree centrality\n", + "These movies are rated by a lot of people" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gswkyc5f4c8_", + "outputId": "af141f85-b4b4-4b24-c098-2bdfb88da4f0" + }, + "outputs": [], + "source": [ + "list(sdcm.keys())[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CeHGqYwX4c8_" + }, + "source": [ + "## How does this importance measure vary over the user community?\n", + "We can get a sense of this by viewing the distribution of this measure." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "0pOqgjAp4c9A", + "outputId": "a3abe1e1-e275-46fa-85a0-a1b64d0ec7a6" + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "npvals = np.fromiter(sdcu.values(), dtype=float)\n", + "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", + "ax = sns.distplot(npvals, hist = False)\n", + "plt.title(\"Distribution of User Degree Centrality\")\n", + "plt.xlabel(\"User Degree Centraility\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kbzu3I_q4c9A" + }, + "source": [ + "## How does this importance measure vary over the movie community?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "LQ28qv9y4c9A", + "outputId": "e0b35ddc-0ce8-4ef6-ff11-b5ec7d2115db" + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "npvals = np.fromiter(sdcm.values(), dtype=float)\n", + "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", + "ax = sns.distplot(npvals, hist = False)\n", + "plt.title(\"Distribution of Movie Degree Centrality\")\n", + "plt.xlabel(\"Movie Degree Centraility\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "z1GwPZIg4c9A" + }, + "source": [ + "### Identified fact:\n", + "_We have a measure to capture the importance of a movie and a user on the basis of the rating activity performed on them. We have identified important users and movies in the network on the basis of this measure. 
We also have information about how this importance measure varies over the users and movies in the data._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x1vtgYbAXpec" + }, + "outputs": [], + "source": [ + "m4v = [t[0] for t in g.in_edges('Movies/4')]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sO21vdXv4c9A" + }, + "source": [ + "## Similarity of nodes\n", + "Just like measures exist to quantify the importance of nodes in a network, there exist measures to quantify the similarity of nodes in a network. We will pick a random node in the network and characterize the similarity of nodes connected to it." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y4W1MtL-RTjK" + }, + "source": [ + "## How similar are viewers of the movie 'Get Shorty'?\n", + "In this example, we will use the __Jaccard__ similarity as a measure of node similarity. We first get all pairs of users who have seen the movie and then compute the __Jaccard__ similarity between them. The details are shown below. Networkx provides an API for this purpose that we will use."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z4WoMVchXtVq" + }, + "outputs": [], + "source": [ + "from itertools import combinations\n", + "m4vucmb = list(combinations(m4v, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3Z4VBUvkXyHl" + }, + "outputs": [], + "source": [ + "import networkx as nx\n", + "gp = g.to_undirected()\n", + "jcp = nx.jaccard_coefficient(gp, m4vucmb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f2i7Puh9X14N" + }, + "outputs": [], + "source": [ + "jcpv = []\n", + "for u, v, p in jcp:\n", + " jcpv.append(p)\n", + " #print('(%s, %s) -> %.8f' % (u, v, p))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-L1O0rhe4c9B" + }, + "source": [ + "## What does the distribution of User Similarity look like?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "ANP7CmNj4c9B", + "outputId": "0d78641c-b03c-48ff-e3a8-76a14d6719cf" + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", + "ax = sns.distplot(jcpv, hist = False)\n", + "plt.title(\"Distribution of Jaccard Similarity between Raters of Get Shorty\")\n", + "plt.xlabel(\"Jaccard Similarity\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dRNe85k54c9B" + }, + "source": [ + "### Identified fact:\n", + "_Picking a random node in the graph, we characterized the similarity of nodes connected to it. In this dataset, we can use this idea to get a sense of how similar users rating a movie tend to be. 
A review of the above figure shows that a range of similarities is observed._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1SJKyWDe4c9B" + }, + "source": [ + "## Bipartite Clustering\n", + "The next idea that we will investigate on the graph comes from [Latapy et al.](https://arxiv.org/abs/cond-mat/0611631). In this work, the authors extend the notion of clustering coefficients associated with a node to the case of bi-partite graphs. Since our graph is a bipartite graph, we will use this metric. See section VIII of the paper for a detailed explanation of the idea. Briefly, the clustering coefficient for a pair of nodes $u, v$, is defined as follows:\n", + "$$\n", + "\\begin{equation*}\n", + "cc(u,v) = \\frac{|N(u)\\cap N(v)|}{|N(u) \\cup N(v)|},\n", + "\\end{equation*}\n", + "$$\n", + "where:\n", + "\n", + "* N(u) is the neighborhood of node $u$. These are the nodes connected to $u$. For example, the movies user $u$ rates.\n", + "* N(v) is the neighborhood of node $v$.\n", + "Reviewing the above equation, it is evident that the clustering coefficient captures the shared neighborhood for a pair of nodes. Nodes associated with high clustering coefficient have many common neighbors. In the context of this problem, a pair of users with a high clustering coefficient suggests that these users have rated many movies in common. Similarly, a pair of movies with a high clustering coefficient suggests that there are many common raters (users) for these movies. Networkx provides an implementation of this idea. The details of computing the clustering coefficients for the users and movies are shown below.\n", + "To obtain the clustering coefficient for a node $u$, we simply average the clustering coefficient of $u$ with other nodes that it shares common neighbors with. 
It is defined as follows, where $N(N(u))$ denotes the second-order neighbors of $u$ (excluding $u$ itself):\n", + "\n", + "$$\n", + "\\begin{equation*}\n", + "cc(u) = \\frac{\\sum_{v \\in N(N(u))} cc(u,v)}{\\left| N(N(u)) \\right|},\n", + "\\end{equation*}\n", + "$$\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "u7o7oLkC4c9B" + }, + "outputs": [], + "source": [ + "cr = bipartite.clustering(B)\n", + "cu = {}\n", + "cm = {}\n", + "for k, v in sorted(cr.items(),reverse=True, key=lambda item: item[1]):\n", + " if k.startswith(\"Users\"):\n", + " cu[k] = v\n", + " else:\n", + " cm[k] = v\n", + "\n", + "del cr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ATmq_-aK4c9B" + }, + "source": [ + "## List the top 10 users in terms of clustering tendencies \n", + "These users share a high degree of common movie rating activity with other users. If rating is used as a proxy for the act of liking or disliking a movie, then users with high clustering values share rating activity with many other users." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HX9colNZ4c9B", + "outputId": "660accba-5da0-47ba-b479-17667dc94eb9" + }, + "outputs": [], + "source": [ + "list(cu.keys())[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gE83U_Am4c9B" + }, + "source": [ + "## List the top 10 movies in terms of clustering tendencies \n", + "These movies have a high degree of common users rating them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Av0ToBrp4c9C", + "outputId": "c97d33bd-1940-40e6-da2e-f2386753712b" + }, + "outputs": [], + "source": [ + "list(cm.keys())[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BgGI3bv04c9C" + }, + "source": [ + "## How does user clustering tendency vary over the community?"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "JNGtK0z84c9C", + "outputId": "46a87af3-a70f-4613-fc00-a18290bde79e" + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "npvals = np.fromiter(cu.values(), dtype=float)\n", + "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", + "ax = sns.distplot(npvals, hist = False)\n", + "plt.title(\"Distribution of User Clustering Coefficients\")\n", + "plt.xlabel(\"User Clustering Coefficients\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EjfGafPm4c9C" + }, + "source": [ + "### Note:\n", + "Note that the distribution of user clustering coefficients has two humps (bi-modal). This suggests that there are two user communities. This is borne out when we visualize a sample of the user community using a _tSNE_ plot later in this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SPxiPAjm4c9C" + }, + "source": [ + "## How does movie clustering tendency vary over the community?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "enjjfIib4c9C", + "outputId": "bf997c79-d536-42e9-dd90-1263cfea1a0b" + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "import numpy as np\n", + "npvals = np.fromiter(cm.values(), dtype=float)\n", + "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", + "ax = sns.distplot(npvals, hist = False)\n", + "plt.title(\"Distribution of Movie Clustering Coefficients\")\n", + "plt.xlabel(\"Movie Clustering Coefficients\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0ICweEaZ4c9C" + }, + "source": [ + "### Note:\n", + "Note that the distribution of movie clustering coefficients has three humps. This suggests that there are three movie clusters. 
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b26SbNVt4c9C" + }, + "source": [ + "### Identified fact:\n", + "_We have identified users and movies that tend to have a lot of common rating activity. We have identified that users and movies have clustering tendencies._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hJmuEFFx4c9C" + }, + "source": [ + "# Using Node2vec to obtain a graph embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nc-VtRGO4c9D" + }, + "source": [ + "Machine learning techniques can be applied on graphs to determine vector representations of graph elements such as nodes and edges. Such a representation is called an _embedding_. We can use the Networkx representation of the graph associated with the 10 most clustered users and determine its embedding. We can use a technique called [_t Stochastic Neighbor Embedding_](https://www.youtube.com/watch?v=RJVL80Gg3lA&list=UUtXKDgv1AVoG88PLl8nGXmw) to plot a two dimensional representation of this subgraph. The details of doing this are provided below. Earlier we noted that the user clustering tendencies were bi-modal. A review of the embedding of the sub-graph associated with the 10 most clustered users shows that there are two clusters in the data."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kHWs_A1X4c9D" + }, + "source": [ + "## Extract the sub-graph associated with the 10 most clustered users" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IoGJKmJm4c9D" + }, + "outputs": [], + "source": [ + "t10cu = list(cu.keys())[:10]\n", + "proj_user = nx.bipartite.projected_graph(B, t10cu)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qyZwFHnL4c9D", + "outputId": "2d8eaf1e-b445-430e-e675-6eb261884b11" + }, + "outputs": [], + "source": [ + "len(list(proj_user.edges()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tLgoFfvf4c9D", + "outputId": "68dd10d8-d483-4ed8-af2c-cca5afdb845c" + }, + "outputs": [], + "source": [ + "len(proj_user.nodes())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Jv5XKBLfRTjd" + }, + "source": [ + "## Embed the sub-graph using Node2vec " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 66, + "referenced_widgets": [ + "8c4c63c2ec584e8694949cde2b878278", + "6276d89c53f1452b82fed9c2af7e0236", + "e7a24632f5fb4670b7fd8945c8e64ce7", + "b9d9e6899f234dc381a947f25edd0b1f", + "7d125a00ff3046aab6c2dbd1b824ee5f", + "a61b9cb46d724bf8bdb24de5a9592017", + "9d1e1bb9a456457a87bb0dcb440f76dc", + "914e350efb3e4f4fbb1baf94fb4134a4" + ] + }, + "id": "e-fD-eNhX9Lh", + "outputId": "a2ffbdc5-860e-4e04-8e92-59ea7d3ff1e6" + }, + "outputs": [], + "source": [ + "from node2vec import Node2Vec\n", + "node2vec = Node2Vec(proj_user, dimensions=64, walk_length=10, num_walks=100, workers=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6hu6Z0VPYDFK" + }, + "outputs": [], + "source": [ + "model = 
node2vec.fit(window=10, min_count=1, batch_words=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "kUN1zLMG4c9D", + "outputId": "fd0aaad1-155c-45a2-d6b8-6f48c14920d0" + }, + "outputs": [], + "source": [ + "t10cu[2]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0NMFl0dD4c9D" + }, + "source": [ + "## Apply the model\n", + "We can ask questions like \"Who is most similar to (a particular user) Users/64?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "41YVYl3jYHia", + "outputId": "02c9036c-cf25-4018-c742-ca595aa91b83" + }, + "outputs": [], + "source": [ + "model.wv.most_similar(t10cu[2])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEgDZqYb4c9E" + }, + "source": [ + "## Embed the graph with tSNE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Ojg_VZVP4c9E" + }, + "outputs": [], + "source": [ + "t10cu_emb = { n: list(model.wv.get_vector(n)) for n in proj_user.nodes()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 604 + }, + "id": "joYmsa3X4c9E", + "outputId": "a0e839c9-914d-4e0b-8499-37cb669f1e04" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from matplotlib import cm\n", + "from sklearn.manifold import TSNE\n", + "fig = plt.figure(figsize=(11.7,8.27))\n", + "plt.grid(True)\n", + "plt.xlabel(\"Comp 1\")\n", + "plt.ylabel(\"Comp 2\")\n", + "plt.title(\"tSNE plot of subgraph of top 10 most clustered users\")\n", + "t10cu_embedded = TSNE(n_components=2).fit_transform(list(t10cu_emb.values()))\n", + "sns.scatterplot(t10cu_embedded[:,0], t10cu_embedded[:,1])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "97aU0nWw4c9E" + }, + "source": [ + 
"__Using the Networkx adapter we have been able to perform a range of analysis tasks using the Networkx API. We were also able use the Networkx graph to obtain an embedding of a graph using Node2vec.__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dA_np4UI4c9E" + }, + "source": [ + "## Store Embeddings in ArangoDB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "uzMUv9Ol4c9E", + "outputId": "6a171f56-f70c-4c04-dad4-85ec2d7fcd30" + }, + "outputs": [], + "source": [ + "%time\n", + "collection = db[\"Users\"]\n", + "with BulkOperation(collection, batchSize=500) as col:\n", + " #user_col = db.collections[\"Users\"]\n", + " for u, e in t10cu_emb.items():\n", + " the_key = u.split('/')[1]\n", + " the_user = collection[the_key]\n", + " the_user[\"n2v_emb\"] = e\n", + " the_user.patch()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O81cxxoM4c9E" + }, + "source": [ + "# Store metadata about these experiments using Arangopipe" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7VxWRZOy4c9E" + }, + "outputs": [], + "source": [ + "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n", + "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n", + "from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig\n", + "from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam\n", + "mdb_config = ArangoPipeConfig()\n", + "msc = ManagedServiceConnParam()\n", + "conn_params = { msc.DB_SERVICE_HOST : \"arangoml.arangodb.cloud\", \\\n", + " msc.DB_SERVICE_END_POINT : \"createDB\",\\\n", + " msc.DB_SERVICE_NAME : \"createDB\",\\\n", + " msc.DB_SERVICE_PORT : 8529,\\\n", + " msc.DB_CONN_PROTOCOL : 'https'}\n", + " \n", + "mdb_config = mdb_config.create_connection_config(conn_params)\n", + "admin = 
ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n", + "ap_config = admin.get_config()\n", + "ap = ArangoPipe(config = ap_config)\n", + "proj_info = {\"name\": \"IMDB_Movie_Reviews\"}\n", + "proj_reg = admin.register_project(proj_info)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LH3eVkTHM6x6", + "outputId": "80cae2cd-d390-4a7d-b734-5b1b7893367a" + }, + "outputs": [], + "source": [ + "mdb_config.get_cfg()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2vQCXknQ4c9F" + }, + "outputs": [], + "source": [ + "import io\n", + "import requests\n", + "url = ('https://raw.githubusercontent.com/arangoml/networkx-adapter/master/examples/IMDB_Networkx_Adapter.ipynb')\n", + "nbjson = requests.get(url).text\n", + "model_info = {\"name\": \"Exploratory Data Analysis IMDB\", \"task\": \"multiple\", 'notebook': nbjson}\n", + "model_reg = ap.register_model(model_info, project = \"IMDB_Movie_Reviews\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "motmuZKd4c9F" + }, + "source": [ + "# Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bkhLXelI4c9F" + }, + "source": [ + "* The Networkx-Adapter makes it easy to obtain Networkx graphs from ArangoDB graphs.\n", + "* We have demonstrated that using a graph representation helped us identify facts about the data. These facts were identified by leveraging standard ideas from Network Theory.\n", + "* We have leveraged the Networkx graph to obtain a _Node2vec_ embedding of the graph associated with our data. For an example of how the Networkx-Adapter can be used with a _deep learning_ library, please view the [ITSM ArangoDB Adapter](https://github.com/arangoml/networkx-adapter/blob/master/examples/ITSM_ArangoDB_Adapter.ipynb). 
" + ] + } + ], + "metadata": { + "colab": { + "name": "IMDB_Networkx_Adapter.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/ITSM_ArangoDB_Adapter.ipynb b/examples/ITSM_ArangoDB_Adapter.ipynb index eb71f4b2..b840d7e3 100644 --- a/examples/ITSM_ArangoDB_Adapter.ipynb +++ b/examples/ITSM_ArangoDB_Adapter.ipynb @@ -3,27 +3,22 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", - "id": "view-in-github" + "id": "MjUp8oZA_0EL" }, "source": [ - "\"Open" + "# Predicting IT Service Ticket Reassingnment Using RGCN (DGL)" ] }, { "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "MjUp8oZA_0EL" - }, + "metadata": {}, "source": [ - "# Predicting IT Service Ticket Reassingnment Using RGCN (DGL)" + "\"Open" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "e6qWopH1_0EN" }, "source": [ @@ -33,7 +28,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "K6LQVNyO_0EQ" }, "source": [ @@ -44,18 +38,18 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "FF_RsnbzZsAM" }, "outputs": [], "source": [ "%%capture cap_out --no-stderr\n", - "!git clone -b master https://github.com/arangoml/networkx-adapter.git\n", - "!rsync - av networkx-adapter/examples / . 
/ --exclude = .git\n", + "!git clone -b oasis_connector --single-branch https://github.com/arangodb/interactive_tutorials.git\n", + "!git clone -b 0.0.0.2.5.3 --single-branch https://github.com/arangoml/networkx-adapter.git\n", + "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", + "!rsync -av interactive_tutorials/ ./ --exclude=.git\n", + "!pip3 install adbnx_adapter==0.0.0.2.5.3\n", "!pip3 install networkx\n", "!pip3 install matplotlib\n", - "!pip install --index-url https://test.pypi.org/simple/ adbnx-adapter==0.0.0.2.5.3\n", "!pip3 install pyarango\n", "!pip3 install python-arango\n", "!pip install dgl" @@ -65,8 +59,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "Ly7pC2qEAjdL" }, "outputs": [], @@ -78,7 +70,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "50Xg1Wz1_0Eb" }, "source": [ @@ -88,7 +79,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "punGc278_0Ec" }, "source": [ @@ -99,8 +89,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "NhIdp2rNaos7" }, "outputs": [], @@ -118,7 +106,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "JWWsUIV0_0Ei" }, "source": [ @@ -128,7 +115,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "_HoUDu8R_0Ej" }, "source": [ @@ -139,20 +125,17 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "o5Q1aESiatNB" }, "outputs": [], "source": [ - "\n", + "!chmod -R 755 ./tools\n", "!./tools/arangorestore -c none --server.endpoint http+ssl://{con[\"hostname\"]}:{con[\"port\"]} --server.username {con[\"username\"]} --server.database {con[\"dbName\"]} --server.password {con[\"password\"]} --default-replication-factor 3 --input-directory \"data/dgl_data_dump\"" ] }, { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "ZqwkRxVx_0Eo" 
}, "source": [ @@ -162,7 +145,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "b4c-5Cpl_0Ep" }, "source": [ @@ -173,8 +155,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "5C-NW4amZ815" }, "outputs": [], @@ -195,7 +175,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "R8KCUbNi_0Eu" }, "source": [ @@ -206,8 +185,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "3qACbcQBbLEx" }, "outputs": [], @@ -220,8 +197,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "yBIgn6fGbPo1" }, "outputs": [], @@ -233,7 +208,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "SvY1SS3u_0E3" }, "source": [ @@ -244,8 +218,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "5KL6fN4wbfW3" }, "outputs": [], @@ -259,7 +231,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "6uMb3qm6_0E8" }, "source": [ @@ -269,7 +240,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "iTqlOrYA_0E8" }, "source": [ @@ -280,8 +250,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "UyDWiVL8bSny" }, "outputs": [], @@ -388,8 +356,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "Mjsfj08cboEx" }, "outputs": [], @@ -415,7 +381,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "SqMjZoSO_0FE" }, "source": [ @@ -426,8 +391,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "z2YZAGfgbsSy" }, "outputs": [], @@ -443,7 +406,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "oJIloQV-_0FI" }, "source": [ @@ -454,8 +416,6 @@ 
"cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "XH-d0z3ibzJ0" }, "outputs": [], @@ -497,7 +457,6 @@ { "cell_type": "markdown", "metadata": { - "colab_type": "text", "id": "_B6fTCvt_0FM" }, "source": [ @@ -509,8 +468,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "zZucEAm4_0FM" }, "outputs": [], @@ -523,8 +480,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "OsJyZCgz_0FQ" }, "outputs": [], @@ -537,8 +492,6 @@ "cell_type": "code", "execution_count": null, "metadata": { - "colab": {}, - "colab_type": "code", "id": "Bto7Q0qa_0FT" }, "outputs": [], @@ -549,7 +502,6 @@ ], "metadata": { "colab": { - "include_colab_link": true, "name": "ITSM_ArangoDB_Adapter.ipynb", "provenance": [] }, @@ -572,5 +524,5 @@ } }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 0 } diff --git a/examples/ITSM_EDA.ipynb b/examples/ITSM_EDA.ipynb index 2bdd4374..00dc7f9c 100644 --- a/examples/ITSM_EDA.ipynb +++ b/examples/ITSM_EDA.ipynb @@ -1,308 +1,275 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "colab": { - "name": "ITSM_EDA.ipynb", - "provenance": [], - "include_colab_link": true - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "EdwmCDNMvacj" + }, + "source": [ + "\"Open" + ] }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RjBZf-SVdrga", - "colab_type": "text" - 
}, - "source": [ - "## Install Required Libraries" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "baCrm4a-dpUq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%%capture\n", - "!git clone -b doc_updates_nx https://github.com/arangoml/networkx-adapter.git\n", - "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", - "!pip3 install networkx\n", - "!pip3 install matplotlib\n", - "!pip3 install --index-url https://test.pypi.org/simple/ adbnx-adapter==0.0.0.2.5.3\n", - "!pip3 install pyarango\n", - "!pip3 install python-arango\n" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jL2HuHIqdk2W", - "colab_type": "text" - }, - "source": [ - "## Data Characteristics" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BPqAGfABdk2X", - "colab_type": "text" - }, - "source": [ - "The data is an event log that was extracted from the audit system of a __ServiceNow__ platform (this is an enterprise service help desk application). The data is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Incident+management+process+enriched+event+log) (please visit the link for more details). This notebook captures the salient aspects of exploratory analysis of this dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-0zJY4eldk2a", - "colab_type": "text" - }, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eAMWQcO3dk2b", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pandas as pd\n", - "fp = \"data/incident_event_log.csv\"\n", - "df = pd.read_csv(fp)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DKTpEAXJdk2g", - "colab_type": "text" - }, - "source": [ - "## What are the main characteristics?\n", - "1. What does a sample of the dataset look like?\n", - "2. 
How many incidents are reported in this dataset?" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "4HT6rg9hdk2h", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "_3onCkdsdk2m", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['number'].nunique()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QQkLOWkFdk2q", - "colab_type": "text" - }, - "source": [ - "## List the data types of the various attributes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "wspAzvw6dk2r", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df.dtypes" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bc6SO7Jkdk2v", - "colab_type": "text" - }, - "source": [ - "## Convert the $\\texttt{sys_updated_at}$ attribute to be a timestamp" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "3HpATu15dk2w", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['sys_updated_at'] = pd.to_datetime(df['sys_updated_at'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yGeJtQ94dk20", - "colab_type": "text" - }, - "source": [ - "## Machine Learning Task for this Dataset " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n3EsAxURdk20", - "colab_type": "text" - }, - "source": [ - "The contributors of this dataset have used this data to predict the time to resolution of the ticket. This data has been used for a classification task in this work. A [graph convolutional network for relational data(GCN)](https://arxiv.org/abs/1703.06103) will be the machine learning task for this work. We will be using a __GCN__ to predict the property of a particular node. What property would be useful to predict ? 
What are the characteristics of this property in the data? The cells below explore these questions. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yGxcIAlIdk21", - "colab_type": "text" - }, - "source": [ - "### Explore candidate list of tags\n", - "Note: For the experiment, we will pick a tag that is fairly evenly distributed in the data. This will avoild the imbalanced classs label problem." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ofWeKiuWdk22", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dfcc = df[['made_sla', 'urgency', 'impact', 'reassignment_count']]\n", - "for c in dfcc.columns.tolist():\n", - " print(str(dfcc[c].value_counts()))" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gKV5JEUUdk28", - "colab_type": "text" - }, - "source": [ - "A review of the level counts of the categorical variables in this dataset suggest that $\\texttt{made_sla}$ and $\\texttt{urgency}$ are both highly imbalanced. The minority levels are almost anomalies. The $\\texttt{reassignment_count}$ seems promising. We can derive a new attribute $\\texttt{reassigned}$ that captures if the ticket has been reassigned, i.e., has it been assigned to someone after the initial assignment. Such an attribute captures inefficiencies in triaging the ticket and is a useful indicator to track for an organization. A $0$ for this attribute indicates that there was no reassignment and a $1$ indicates that there was a reassignment. This attribute has a nice even spread in the data, i.e., an almost even spread of $0$ and $1$. The cells below create this attribute" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CYGI6SW5dk29", - "colab_type": "text" - }, - "source": [ - "## Feature Creation (reassigned):\n", - "It looks like tracking ticket reassignment can create a variable that is somewhat evenly distributed in the data. 
About half the tickets have the correct assignment at first. About half are reassigned to various degrees." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XgFfomD6dk2-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['reassigned'] = df['reassignment_count'].apply(lambda x: 0 if x == 0 else 1)\n", - "df['reassigned'].value_counts()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "nb44FIQZdk3B", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dfpp = df.loc[df.groupby(by=['number']).sys_updated_at.idxmax()]\n", - "dfpp = dfpp.reset_index()\n", - "cols = dfpp.columns.tolist()\n", - "cols.remove('index')\n", - "cols.remove('number')\n", - "dfpp = dfpp[cols]" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sFYjOPcRdk3E", - "colab_type": "text" - }, - "source": [ - "Now that we have characterized the data and identified the machine learning task to be performed. The next step is to transform the data to a form amenable for machine learning. 
" - ] - } - ] -} \ No newline at end of file + { + "cell_type": "markdown", + "metadata": { + "id": "RjBZf-SVdrga" + }, + "source": [ + "## Install Required Libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "baCrm4a-dpUq" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!git clone -b 0.0.0.2.5.3 https://github.com/arangoml/networkx-adapter.git\n", + "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", + "!pip3 install adbnx-adapter==0.0.0.2.5.3\n", + "!pip3 install networkx\n", + "!pip3 install matplotlib\n", + "!pip3 install pyarango\n", + "!pip3 install python-arango" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jL2HuHIqdk2W" + }, + "source": [ + "## Data Characteristics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BPqAGfABdk2X" + }, + "source": [ + "The data is an event log that was extracted from the audit system of a __ServiceNow__ platform (this is an enterprise service help desk application). The data is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Incident+management+process+enriched+event+log) (please visit the link for more details). This notebook captures the salient aspects of exploratory analysis of this dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-0zJY4eldk2a" + }, + "source": [ + "## Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eAMWQcO3dk2b" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "fp = \"data/incident_event_log.csv\"\n", + "df = pd.read_csv(fp)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DKTpEAXJdk2g" + }, + "source": [ + "## What are the main characteristics?\n", + "1. What does a sample of the dataset look like?\n", + "2. How many incidents are reported in this dataset?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4HT6rg9hdk2h" + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_3onCkdsdk2m" + }, + "outputs": [], + "source": [ + "df['number'].nunique()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QQkLOWkFdk2q" + }, + "source": [ + "## List the data types of the various attributes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wspAzvw6dk2r" + }, + "outputs": [], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bc6SO7Jkdk2v" + }, + "source": [ + "## Convert the $\\texttt{sys_updated_at}$ attribute to be a timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3HpATu15dk2w" + }, + "outputs": [], + "source": [ + "df['sys_updated_at'] = pd.to_datetime(df['sys_updated_at'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yGeJtQ94dk20" + }, + "source": [ + "## Machine Learning Task for this Dataset " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n3EsAxURdk20" + }, + "source": [ + "The contributors of this dataset have used this data to predict the time to resolution of the ticket. This data has been used for a classification task in this work. A [graph convolutional network for relational data(GCN)](https://arxiv.org/abs/1703.06103) will be the machine learning task for this work. We will be using a __GCN__ to predict the property of a particular node. What property would be useful to predict ? What are the characteristics of this property in the data? The cells below explore these questions. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yGxcIAlIdk21" + }, + "source": [ + "### Explore candidate list of tags\n", + "Note: For the experiment, we will pick a tag that is fairly evenly distributed in the data. 
This will avoid the imbalanced class label problem." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ofWeKiuWdk22" + }, + "outputs": [], + "source": [ + "dfcc = df[['made_sla', 'urgency', 'impact', 'reassignment_count']]\n", + "for c in dfcc.columns.tolist():\n", + " print(str(dfcc[c].value_counts()))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gKV5JEUUdk28" + }, + "source": [ + "A review of the level counts of the categorical variables in this dataset suggests that $\\texttt{made_sla}$ and $\\texttt{urgency}$ are both highly imbalanced. The minority levels are almost anomalies. The $\\texttt{reassignment_count}$ seems promising. We can derive a new attribute $\\texttt{reassigned}$ that captures if the ticket has been reassigned, i.e., has it been assigned to someone after the initial assignment. Such an attribute captures inefficiencies in triaging the ticket and is a useful indicator to track for an organization. A $0$ for this attribute indicates that there was no reassignment and a $1$ indicates that there was a reassignment. This attribute has a nice even spread in the data, i.e., an almost even spread of $0$ and $1$. The cells below create this attribute." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CYGI6SW5dk29" + }, + "source": [ + "## Feature Creation (reassigned):\n", + "It looks like tracking ticket reassignment can create a variable that is somewhat evenly distributed in the data. About half the tickets have the correct assignment at first. About half are reassigned to various degrees." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XgFfomD6dk2-" + }, + "outputs": [], + "source": [ + "df['reassigned'] = df['reassignment_count'].apply(lambda x: 0 if x == 0 else 1)\n", + "df['reassigned'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nb44FIQZdk3B" + }, + "outputs": [], + "source": [ + "dfpp = df.loc[df.groupby(by=['number']).sys_updated_at.idxmax()]\n", + "dfpp = dfpp.reset_index()\n", + "cols = dfpp.columns.tolist()\n", + "cols.remove('index')\n", + "cols.remove('number')\n", + "dfpp = dfpp[cols]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sFYjOPcRdk3E" + }, + "source": [ + "Now that we have characterized the data and identified the machine learning task to be performed, the next step is to transform the data into a form amenable to machine learning. " + ] + } + ], + "metadata": { + "colab": { + "name": "ITSM_EDA.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/batch_graph_pre_processing.ipynb b/examples/batch_graph_pre_processing.ipynb index 1d691cc2..f33a1ca5 100644 --- a/examples/batch_graph_pre_processing.ipynb +++ b/examples/batch_graph_pre_processing.ipynb @@ -1,474 +1,415 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.7.6" - }, - "colab": { - "name": "batch_graph_pre_processing.ipynb", - "provenance": [], - "include_colab_link": true - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "teHHQQILe_dS", - "colab_type": "text" - }, - "source": [ - "## Get Raw Data for Processing" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "G-oxhBfae8Ae", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%%capture\n", - "!git clone -b doc_updates_nx https://github.com/arangoml/networkx-adapter.git\n", - "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", - "!pip3 install networkx\n", - "!pip3 install matplotlib" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KzKGdXzUenuM", - "colab_type": "text" - }, - "source": [ - "## Preprocessing ITSM Data\n", - "The purpose of this notebook is to prepare the data in a format suitable for machine learning. The dataset consists of a few numerical and many categorical attributes. The numerical attributes are discretized. The embedding for the categorical values is developed similar to developing embeddings for words in NLP (see https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html). Each categorical value is mapped to a unique integer. The encoded data that is presented to the embedding layer is a sequence of integers, with each integer corresponding to a word, This notebook performs this mapping. It also encodes unknown values to a 'UNKNOWN' category." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "bkRP-4eBenuN", - "colab_type": "code", - "colab": {} - }, - "source": [ - "import pandas as pd\n", - "fp = \"data/incident_event_log.csv\"\n", - "df = pd.read_csv(fp)\n", - "df['reassigned'] = df['reassigned'] = df['reassignment_count'].apply(lambda x: 0 if x == 0 else 1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YGEszr3GenuT", - "colab_type": "text" - }, - "source": [ - "## Discretize the numerical attributes" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "d6mTJ90venuU", - "colab_type": "code", - "colab": {} - }, - "source": [ - "numeric = ['sys_mod_count', 'reopen_count']\n", - "dfn = df[numeric]\n", - "dcols = []\n", - "for col in numeric:\n", - " dlabel = 'D_' + col\n", - " labels = [dlabel +'_' + str(c) for c in range(5)]\n", - " dcols.append(dlabel)\n", - " dfn[dlabel] = pd.qcut(dfn[col].rank(method='first'),5, labels = labels, duplicates = 'drop')\n", - " " - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "b_EF79pOenuZ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dfn.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lE815N7venud", - "colab_type": "text" - }, - "source": [ - "## Isolate the attributes used for the analysis \n", - "1. Remove the timestamp attributes\n", - "2. Remove the numeric attributes. 
The discretized version of these attributes is added subsequently" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "cxUt-5Enenue", - "colab_type": "code", - "colab": {} - }, - "source": [ - "attributes = df.columns.tolist()\n", - "remove = [ 'made_sla', 'opened_at', 'resolved_at','sys_created_at', 'caller_id', 'closed_at',\\\n", - " 'notify', 'sys_updated_by','sys_created_by', 'number', 'sys_updated_at', 'reassigned' ]\n", - "exclude = remove + numeric\n", - "keep = list(set(attributes) - set(exclude)) \n", - "keep" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "DbVuZhgsenuk", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_cat_vars = df[keep]\n", - "df_cat_vars = df_cat_vars.replace(to_replace = '?', value = 'UNKNOWN')\n", - "df_cat_vars = pd.concat([df_cat_vars, dfn[dcols]], axis = 1)\n", - "df['made_sla'] = df['made_sla'].map({True: 1, False: 0})\n", - "\n", - "df = df.reset_index()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "Vto4QKbxenuo", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['reassigned'].value_counts()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "ogJ0oAlSenus", - "colab_type": "code", - "colab": {} - }, - "source": [ - "cols = df_cat_vars.columns.tolist()\n", - "vocab_size = 0\n", - "for c in cols:\n", - " print(\"Num unique vals for category \" + str(c) + \" = \" + str(df_cat_vars[c].nunique()))\n", - " vocab_size += df_cat_vars[c].nunique()\n", - "print(\"Vocab size: %s\" % vocab_size)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "juRDL5_lenuw", - "colab_type": "text" - }, - "source": [ - "## Recode the categorical values to integers" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "oVM1rplIenux", - "colab_type": "code", - "colab": {} - }, - "source": [ - 
"UNKNOWN_VAL = 1\n", - "cat_cols = df_cat_vars.columns.tolist()\n", - "cat_int_map = {col: dict() for col in cat_cols}\n", - "int_index = 2\n", - "for c in cat_cols:\n", - " unique_col_values = df_cat_vars[c].unique().tolist()\n", - " col_int_map = cat_int_map[c]\n", - " for uv in unique_col_values:\n", - " if uv == 'UNKNOWN':\n", - " col_int_map[uv] = UNKNOWN_VAL\n", - " else:\n", - " col_int_map[uv] = int_index\n", - " int_index +=1\n", - " df_cat_vars[c] = df_cat_vars[c].map(cat_int_map[c]) " - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "lH6qzefOenu0", - "colab_type": "code", - "colab": {} - }, - "source": [ - "combined_cat_int_map = dict()\n", - "for col in cat_int_map.keys():\n", - " for cat_val, int_map in cat_int_map[col].items():\n", - " combined_cat_int_map[cat_val] = int_map\n", - " " - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jH9uA2IVenu3", - "colab_type": "text" - }, - "source": [ - "## Write preprocessed raw data to disk" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "kWMHA7eSenu4", - "colab_type": "code", - "colab": {} - }, - "source": [ - "fp_cat_int_map = \"data/category_to_integer_map.csv\"\n", - "df_map = pd.DataFrame(combined_cat_int_map, index = [0])\n", - "df_map = df_map.T\n", - "df_map = df_map.reset_index()\n", - "df_map.columns = [\"cat_value\", \"assigned_integer\"]\n", - "df_map.to_csv(fp_cat_int_map, index = False)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qzbT2d08enu7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "add_to_cat_vars = ['number','sys_updated_at', 'reassigned'] \n", - "df = pd.concat([df[add_to_cat_vars], df_cat_vars], axis = 1)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "wDA8_L5-enu-", - "colab_type": "code", - "colab": {} - }, - "source": [ - 
"df['sys_updated_at'] = pd.to_datetime(df['sys_updated_at']) " - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qN8WD2teenvB", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['sys_updated_at'].dtype" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "93OcLjSVenvE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df = df.sort_values(by = ['number', 'sys_updated_at'])" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "WK00OhfzenvH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "fp = 'data/pp_batch_incident_event_log.csv'\n", - "df.to_csv(fp, index = False)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "0DcHNc36envK", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df['reassigned'].value_counts()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "0GqTrpLVenvM", - "colab_type": "code", - "colab": {} - }, - "source": [ - "int_index" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0n0Jwy-OenvP", - "colab_type": "text" - }, - "source": [ - "## Generate data for learning\n", - "The data used for learning has the raw data summarized by incident, i.e. , the raw data for each incident is grouped and summarized. A sample of the data used for learning can be viewed." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "odZ5S1XienvQ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "dfgb = df.groupby(by = ['number'])\n", - "df_pp = df.loc[dfgb.sys_updated_at.idxmax()]\n", - "df_pp = df_pp.reset_index()\n", - "cols = df_pp.columns.tolist()\n", - "cols.remove('index')\n", - "df_pp = df_pp[cols]\n", - "fprp = \"data/pp_recoded_incident_event_log.csv\"\n", - "df_pp.to_csv(fprp, index = False)" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "jZ_27dx5envT", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_pp['reassigned'].value_counts()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "NnNYbdqzenvX", - "colab_type": "code", - "colab": {} - }, - "source": [ - "df_pp.head()" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "1Bi1Wyaienvb", - "colab_type": "code", - "colab": {} - }, - "source": [ - "vocab_size" - ], - "execution_count": 0, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "p77zo3FUenve", - "colab_type": "code", - "colab": {} - }, - "source": [ - "" - ], - "execution_count": 0, - "outputs": [] - } - ] -} \ No newline at end of file + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "teHHQQILe_dS" + }, + "source": [ + "## Get Raw Data for Processing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G-oxhBfae8Ae" + }, + "outputs": [], + "source": [ + "%%capture\n", + "!git clone -b 0.0.0.2.5.3 https://github.com/arangoml/networkx-adapter.git\n", + "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n", + "!pip3 install networkx\n", + "!pip3 install matplotlib" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KzKGdXzUenuM" + }, + "source": [ + "## 
Preprocessing ITSM Data\n", + "The purpose of this notebook is to prepare the data in a format suitable for machine learning. The dataset consists of a few numerical and many categorical attributes. The numerical attributes are discretized. The embedding for the categorical values is developed similar to developing embeddings for words in NLP (see https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html). Each categorical value is mapped to a unique integer. The encoded data that is presented to the embedding layer is a sequence of integers, with each integer corresponding to a word. This notebook performs this mapping. It also encodes unknown values to an 'UNKNOWN' category." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bkRP-4eBenuN" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "fp = \"data/incident_event_log.csv\"\n", + "df = pd.read_csv(fp)\n", + "df['reassigned'] = df['reassigned'] = df['reassignment_count'].apply(lambda x: 0 if x == 0 else 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YGEszr3GenuT" + }, + "source": [ + "## Discretize the numerical attributes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d6mTJ90venuU" + }, + "outputs": [], + "source": [ + "numeric = ['sys_mod_count', 'reopen_count']\n", + "dfn = df[numeric]\n", + "dcols = []\n", + "for col in numeric:\n", + " dlabel = 'D_' + col\n", + " labels = [dlabel +'_' + str(c) for c in range(5)]\n", + " dcols.append(dlabel)\n", + " dfn[dlabel] = pd.qcut(dfn[col].rank(method='first'),5, labels = labels, duplicates = 'drop')\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "b_EF79pOenuZ" + }, + "outputs": [], + "source": [ + "dfn.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lE815N7venud" + }, + "source": [ + "## Isolate the attributes used for the analysis \n", + "1. 
Remove the timestamp attributes\n", + "2. Remove the numeric attributes. The discretized version of these attributes is added subsequently" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cxUt-5Enenue" + }, + "outputs": [], + "source": [ + "attributes = df.columns.tolist()\n", + "remove = [ 'made_sla', 'opened_at', 'resolved_at','sys_created_at', 'caller_id', 'closed_at',\\\n", + " 'notify', 'sys_updated_by','sys_created_by', 'number', 'sys_updated_at', 'reassigned' ]\n", + "exclude = remove + numeric\n", + "keep = list(set(attributes) - set(exclude)) \n", + "keep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DbVuZhgsenuk" + }, + "outputs": [], + "source": [ + "df_cat_vars = df[keep]\n", + "df_cat_vars = df_cat_vars.replace(to_replace = '?', value = 'UNKNOWN')\n", + "df_cat_vars = pd.concat([df_cat_vars, dfn[dcols]], axis = 1)\n", + "df['made_sla'] = df['made_sla'].map({True: 1, False: 0})\n", + "\n", + "df = df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vto4QKbxenuo" + }, + "outputs": [], + "source": [ + "df['reassigned'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ogJ0oAlSenus" + }, + "outputs": [], + "source": [ + "cols = df_cat_vars.columns.tolist()\n", + "vocab_size = 0\n", + "for c in cols:\n", + " print(\"Num unique vals for category \" + str(c) + \" = \" + str(df_cat_vars[c].nunique()))\n", + " vocab_size += df_cat_vars[c].nunique()\n", + "print(\"Vocab size: %s\" % vocab_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "juRDL5_lenuw" + }, + "source": [ + "## Recode the categorical values to integers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oVM1rplIenux" + }, + "outputs": [], + "source": [ + "UNKNOWN_VAL = 1\n", + "cat_cols = df_cat_vars.columns.tolist()\n", + "cat_int_map = {col: dict() 
for col in cat_cols}\n", + "int_index = 2\n", + "for c in cat_cols:\n", + " unique_col_values = df_cat_vars[c].unique().tolist()\n", + " col_int_map = cat_int_map[c]\n", + " for uv in unique_col_values:\n", + " if uv == 'UNKNOWN':\n", + " col_int_map[uv] = UNKNOWN_VAL\n", + " else:\n", + " col_int_map[uv] = int_index\n", + " int_index +=1\n", + " df_cat_vars[c] = df_cat_vars[c].map(cat_int_map[c]) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lH6qzefOenu0" + }, + "outputs": [], + "source": [ + "combined_cat_int_map = dict()\n", + "for col in cat_int_map.keys():\n", + " for cat_val, int_map in cat_int_map[col].items():\n", + " combined_cat_int_map[cat_val] = int_map\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jH9uA2IVenu3" + }, + "source": [ + "## Write preprocessed raw data to disk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kWMHA7eSenu4" + }, + "outputs": [], + "source": [ + "fp_cat_int_map = \"data/category_to_integer_map.csv\"\n", + "df_map = pd.DataFrame(combined_cat_int_map, index = [0])\n", + "df_map = df_map.T\n", + "df_map = df_map.reset_index()\n", + "df_map.columns = [\"cat_value\", \"assigned_integer\"]\n", + "df_map.to_csv(fp_cat_int_map, index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qzbT2d08enu7" + }, + "outputs": [], + "source": [ + "add_to_cat_vars = ['number','sys_updated_at', 'reassigned'] \n", + "df = pd.concat([df[add_to_cat_vars], df_cat_vars], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wDA8_L5-enu-" + }, + "outputs": [], + "source": [ + "df['sys_updated_at'] = pd.to_datetime(df['sys_updated_at']) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qN8WD2teenvB" + }, + "outputs": [], + "source": [ + "df['sys_updated_at'].dtype" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "id": "93OcLjSVenvE" + }, + "outputs": [], + "source": [ + "df = df.sort_values(by = ['number', 'sys_updated_at'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WK00OhfzenvH" + }, + "outputs": [], + "source": [ + "fp = 'data/pp_batch_incident_event_log.csv'\n", + "df.to_csv(fp, index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0DcHNc36envK" + }, + "outputs": [], + "source": [ + "df['reassigned'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0GqTrpLVenvM" + }, + "outputs": [], + "source": [ + "int_index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0n0Jwy-OenvP" + }, + "source": [ + "## Generate data for learning\n", + "The data used for learning has the raw data summarized by incident, i.e., the raw data for each incident is grouped and summarized. A sample of the data used for learning can be viewed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "odZ5S1XienvQ" + }, + "outputs": [], + "source": [ + "dfgb = df.groupby(by = ['number'])\n", + "df_pp = df.loc[dfgb.sys_updated_at.idxmax()]\n", + "df_pp = df_pp.reset_index()\n", + "cols = df_pp.columns.tolist()\n", + "cols.remove('index')\n", + "df_pp = df_pp[cols]\n", + "fprp = \"data/pp_recoded_incident_event_log.csv\"\n", + "df_pp.to_csv(fprp, index = False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jZ_27dx5envT" + }, + "outputs": [], + "source": [ + "df_pp['reassigned'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NnNYbdqzenvX" + }, + "outputs": [], + "source": [ + "df_pp.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Bi1Wyaienvb" + }, + "outputs": [], + "source": [ + "vocab_size" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "p77zo3FUenve" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "name": "batch_graph_pre_processing.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}