diff --git a/bipartite_clustering.pdf b/bipartite_clustering.pdf
new file mode 100644
index 00000000..53b67ff7
Binary files /dev/null and b/bipartite_clustering.pdf differ
diff --git a/examples/IMDB_Networkx_Adapter.ipynb b/examples/IMDB_Networkx_Adapter.ipynb
index 3c3bc28f..f1be0a44 100644
--- a/examples/IMDB_Networkx_Adapter.ipynb
+++ b/examples/IMDB_Networkx_Adapter.ipynb
@@ -1,586 +1,1219 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "IMDB_Networkx_Adapter.ipynb",
- "provenance": [],
- "include_colab_link": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- ""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "VLfArBaMRTiM",
- "colab_type": "text"
- },
- "source": [
- "# Using Node2Vec to Embed Viewers of a Movie using the ArangoDB IMDB NetworkX Adapter"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Wd0_wS6TRTiO",
- "colab_type": "text"
- },
- "source": [
- "This notebook provides the details of using the ArangoDB IMDB NetworkX adapter to develop a _node2vec_ embedding of the viewers of a movie from the _IMDB_ database. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "NwlgpvAcRTiP",
- "colab_type": "text"
- },
- "source": [
- "## Install Required Libraries "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "colab_type": "text",
- "id": "view-in-github"
- },
- "source": [
- ""
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "ZMtTZ5Oth0N-",
- "colab": {}
- },
- "source": [
- "%%capture\n",
- "!git clone -b doc_updates_nx https://github.com/arangoml/networkx-adapter.git\n",
- "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n",
- "!pip3 install networkx\n",
- "!pip3 install matplotlib\n",
- "!pip3 install --index-url https://test.pypi.org/simple/ adbnx-adapter==0.0.0.2.5.3\n",
- "!pip3 install pyarango\n",
- "!pip3 install python-arango\n",
- "!pip install node2vec"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "oBYYw8ttSbBm",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- "!rm creds.dat\n",
- "!touch creds.dat"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "BRAQ2O7fRTiX",
- "colab_type": "text"
- },
- "source": [
- "## Get a Oasis Connection"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kTLmib2BRTiY",
- "colab_type": "text"
- },
- "source": [
- "__Oasis__, the managed database service offering from ArangoDB, will be used for this exercise. This eliminates the need for setting up and configuring an instance of a database."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "9voIoaGRS0cB",
- "colab": {}
- },
- "source": [
- "from adbnx_adapter.imdb_arangoDB_networkx_adapter import IMDBArangoDB_Networkx_Adapter\n",
- "import oasis\n",
- "con = oasis.getTempCredentials()\n",
- "\n",
- "print()\n",
- "print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n",
- "print(\"Username: \" + con[\"username\"])\n",
- "print(\"Password: \" + con[\"password\"])\n",
- "print(\"Database: \" + con[\"dbName\"])\n",
- "\n",
- "\n",
- "ma = IMDBArangoDB_Networkx_Adapter(conn=con)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "APNXpksoRTif",
- "colab_type": "text"
- },
- "source": [
- "## Create the Collections for the Database "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "tDmWTWYkW6VW",
- "colab": {}
- },
- "source": [
- "import csv\n",
- "import json\n",
- "import requests\n",
- "import sys\n",
- "import oasis\n",
- "\n",
- "\n",
- "from pyArango.connection import *\n",
- "from pyArango.collection import Collection, Edges, Field\n",
- "from pyArango.graph import Graph, EdgeDefinition\n",
- "from pyArango.collection import BulkOperation as BulkOperation"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "RQAoZi3AW9ru",
- "colab": {}
- },
- "source": [
- "# Connect to the temp database\n",
- "conn = oasis.connect_pyarango(con)\n",
- "db = conn[con[\"dbName\"]]"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "crgZqALFXEoi",
- "colab": {}
- },
- "source": [
- "from pyArango.collection import Collection, Field\n",
- "from pyArango.graph import Graph, EdgeDefinition\n",
- "\n",
- "\n",
- "class Users(Collection):\n",
- " _fields = {\n",
- " \"user_id\": Field(),\n",
- " # \"age\": Field(),\n",
- " # \"gender\": Field()\n",
- " }\n",
- "\n",
- "\n",
- "class Movies(Collection):\n",
- " _fields = {\n",
- " \"movie_id\": Field(),\n",
- " # \"movie_title\": Field(),\n",
- " # \"release_data\": Field()\n",
- " }\n",
- "\n",
- "\n",
- "class Ratings(Edges):\n",
- " _fields = {\n",
- " # user_id and item_id are encoded by _from, _to\n",
- " \"rating\": Field(),\n",
- " # \"timestamp\": Field()\n",
- " }\n",
- "\n",
- "\n",
- "class IMDBGraph(Graph):\n",
- " _edgeDefinitions = [EdgeDefinition(\"Ratings\", fromCollections=[\n",
- " \"Users\"], toCollections=[\"Movies\"])]\n",
- " _orphanedCollections = []\n",
- "\n",
- "\n",
- "db.createCollection(\"Users\")\n",
- "db.createCollection(\"Movies\")\n",
- "db.createCollection(\"Ratings\")\n",
- "iMDBGraph = db.createGraph(\"IMDBGraph\")\n",
- "\n",
- "print(\"Collection/Graph Setup done.\")"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "f0nSSiRnRTit",
- "colab_type": "text"
- },
- "source": [
- "## Load the Data "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "neURfa3jXPs5",
- "colab": {}
- },
- "source": [
- "collection = db[\"Users\"]\n",
- "with BulkOperation(collection, batchSize=100) as col:\n",
- " with open('data/users.csv', newline='') as csvfile:\n",
- " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
- " # Skip header\n",
- " next(reader)\n",
- " for row in reader:\n",
- " user_id, age, gender, occupation, zip = tuple(row)\n",
- " doc = col.createDocument()\n",
- " doc[\"_key\"] = user_id\n",
- "# doc[\"age\"] = age\n",
- "# doc[\"gender\"] = gender\n",
- " doc.save()\n",
- "\n",
- "collection = db[\"Movies\"]\n",
- "with BulkOperation(collection, batchSize=100) as col:\n",
- " with open('data/movies.csv', newline='') as csvfile:\n",
- " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
- " # Skip header\n",
- " next(reader)\n",
- " for row in reader:\n",
- " movie_id, movie_title, release_date, video_release_date, url, unknown, action, adventure, animation, childrens, comedy, crime, documentary, drama, fantasy, noir, horror, musical, mystery, romance, scifi, thriller, war, western = tuple(\n",
- " row)\n",
- " doc = col.createDocument()\n",
- " doc[\"_key\"] = movie_id\n",
- "# doc[\"movie_title\"] = movie_title\n",
- "# doc[\"release_date\"] = release_date\n",
- " doc.save()\n",
- "\n",
- "collection = db[\"Ratings\"]\n",
- "with BulkOperation(collection, batchSize=1000) as col:\n",
- " with open('data/ratings.csv', newline='') as csvfile:\n",
- " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
- " # Skip header\n",
- " next(reader)\n",
- " for row in reader:\n",
- " user_id, movie_id, rating, timestamp = tuple(row)\n",
- " doc = col.createDocument()\n",
- " doc[\"_from\"] = \"Users/\"+user_id\n",
- " doc[\"_to\"] = \"Movies/\"+movie_id\n",
- " doc[\"ratings\"] = rating\n",
- "# doc[\"timestamp\"] = timestamp\n",
- " doc.save()\n",
- "\n",
- "print(\"Import Done\")"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "tua4CIHYRTi0",
- "colab_type": "text"
- },
- "source": [
- "## Specify the Graph Structure "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9oKkkRxwRTi0",
- "colab_type": "text"
- },
- "source": [
- "To use the IMDB Networkx Adapter, we need to specify the structure of the graph that we want to create. This is done with a simple dictionary. The details of creating the _Networkx_ graph are shown below."
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "wobwe8KqXXi2",
- "colab": {}
- },
- "source": [
- "imdb_attributes = {'vertexCollections': {'Users': {},\n",
- " 'Movies': {}},\n",
- " 'edgeCollections': {'Ratings': {'_from', '_to', 'ratings'}}}"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "jjJZJEgvXZ0W",
- "colab": {}
- },
- "source": [
- "g = ma.create_networkx_graph(\n",
- " graph_name='IMDBGraph', graph_attributes=imdb_attributes)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-ZXT-keARTi9",
- "colab_type": "text"
- },
- "source": [
- "## Inspect the 'Users' and 'Movies' Nodes"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "nWnsHYwAXgyf",
- "colab": {}
- },
- "source": [
- "g.nodes['Users/2']"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "dLAsXY-3XkBY",
- "colab": {}
- },
- "source": [
- "g.nodes['Movies/4']"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8Zf6uWCFRTjG",
- "colab_type": "text"
- },
- "source": [
- "## Who are the viewers of 'Movies/4' ('Get Shorty') "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "x1vtgYbAXpec",
- "colab": {}
- },
- "source": [
- "m4v = [t[0] for t in g.in_edges('Movies/4')]"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "y4W1MtL-RTjK",
- "colab_type": "text"
- },
- "source": [
- "## How similar are viewers of the movie 'Get Shorty'?\n",
- "The __Jaccard__ similarity is used for this purpose. We first get all pairs of users who have seen the movie and then compute the __Jaccard__ similarity between them. The details are shown below"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "z4WoMVchXtVq",
- "colab": {}
- },
- "source": [
- "from itertools import combinations\n",
- "m4vucmb = list(combinations(m4v, 2))"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "3Z4VBUvkXyHl",
- "colab": {}
- },
- "source": [
- "import networkx as nx\n",
- "gp = g.to_undirected()\n",
- "jcp = nx.jaccard_coefficient(gp, m4vucmb)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vo538HEmRTjU",
- "colab_type": "text"
- },
- "source": [
- "## Create a complete sub-graph for the viewers of 'Get Shorty' using the Jaccard Simlarity for the edge weights"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "f2i7Puh9X14N",
- "colab": {}
- },
- "source": [
- "gs = nx.DiGraph()\n",
- "for u, v, p in jcp:\n",
- " gs.add_edge(u, v, weight=p)\n",
- " #print('(%s, %s) -> %.8f' % (u, v, p))"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "n0L-tUzXRTjY",
- "colab_type": "text"
- },
- "source": [
- "## How many edges does the complete sub-graph have?"
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "MhRMcp_8X582",
- "colab": {}
- },
- "source": [
- "gs.number_of_edges()"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Jv5XKBLfRTjd",
- "colab_type": "text"
- },
- "source": [
- "## Embed the sub-graph using Node2vec "
- ]
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "e-fD-eNhX9Lh",
- "colab": {}
- },
- "source": [
- "from node2vec import Node2Vec\n",
- "node2vec = Node2Vec(gs, dimensions=64, walk_length=30, num_walks=200, workers=4)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "6hu6Z0VPYDFK",
- "colab": {}
- },
- "source": [
- "model = node2vec.fit(window=10, min_count=1, batch_words=4)"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "colab_type": "code",
- "id": "41YVYl3jYHia",
- "colab": {}
- },
- "source": [
- "model.wv.most_similar(m4v[5])"
- ],
- "execution_count": 0,
- "outputs": []
- },
- {
- "cell_type": "code",
- "metadata": {
- "id": "C8jFOSabRTjm",
- "colab_type": "code",
- "colab": {}
- },
- "source": [
- ""
- ],
- "execution_count": 0,
- "outputs": []
- }
- ]
-}
\ No newline at end of file
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "VLfArBaMRTiM"
+ },
+ "source": [
+ "# Analysis of IMDB data using the ArangoDB Netoworkx adapter "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Wd0_wS6TRTiO"
+ },
+ "source": [
+ "This notebook will illustrate how the Networkx adapter can be used to perform graph analytics tasks on graph data stored in ArangoDB. For this example we will use the IMDB movie review dataset. The dataset provides reviews of various movies by viewers. The details of the dataset are available [here](https://www.imdb.com/interfaces/). If we use a relational representation to analyze this problem, then we will need to know what we want to look in the data and then seek it from the data. For example we may be interested in the average rating for a particular genre of movies. The point here is that we need to know the questions that are important in the analysis of this problem apriori. When we use a graph representation of this data, then we can leverage the body of ideas used in network analysis (or social network analysis) to extract insights from the data. The body of ideas are the standard questions we seek answers to when analyzing any network. The answers to these questions will provide us insights about the data and the problem. In other words, we have a framework for extracting insights from the data. In the example provided in this notebook, we will use ideas from the analysis of bipartite graphs to illustrate this point. Of course, if we have a specific question that we are seeking the answer for, then that is available easily from the graph as well. Therefore, going with a graph represenation makes it easier to extract insights from the data.t\n",
+ "\n",
+ "The advantages of using a graph representation for the data extend beyond mining and learning from the data. Running adhoc queries for analytic purposes on the data is more efficient with a graph representation. With a relational representation, we can only optimize query performance for queries that are known before hand (with indexes). Adhoc queries could require multiple joins and have poor performance. Most graphs, have the so called _small world effect_. In such graphs, most nodes can be reached from most other nodes with a small number of edge traversals. An adhoc query starting at any node can reach the node of interest in a few hops.\n",
+ "\n",
+ "The illustration in this notebook is organized as follows:\n",
+ "1. Loading the data for the example.\n",
+ "2. Creating the Networkx graph from the AranogoDB graph using the Networkx Adapter.\n",
+ "3. Analysis of the graph using Networkx.\n",
+ "4. Using the Networkx graph to obtain a graph embedding using _Node2vec_.\n",
+ "5. Using Arangopipe to store metadata about this experiment."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Loading the Data into ArangoDB"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We will use Oasis, ArangoDB's managed service offering, for this example. We will start with installing the required libraries to run this example. We will then connect to Oasis, obtain a connection and create the database.After creating the database, we will load the data into ArangoDB. The next few cells illustrate these steps."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "NwlgpvAcRTiP"
+ },
+ "source": [
+ "## Install Required Libraries "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "view-in-github"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "ZMtTZ5Oth0N-"
+ },
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "!git clone -b narrative_addition https://github.com/arangoml/networkx-adapter.git\n",
+ "!rsync -av networkx-adapter/examples/ ./ --exclude=.git\n",
+ "!pip3 install networkx\n",
+ "!pip3 install matplotlib\n",
+ "!pip3 install --index-url https://test.pypi.org/simple/ adbnx-adapter==0.0.0.2.5.3\n",
+ "!pip3 install pyarango\n",
+ "!pip3 install python-arango\n",
+ "!pip install node2vec\n",
+ "!pip install seaborn\n",
+ "!pip install arangopipe==0.0.6.9.3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "oBYYw8ttSbBm"
+ },
+ "outputs": [],
+ "source": [
+ "!rm creds.dat\n",
+ "!touch creds.dat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "BRAQ2O7fRTiX"
+ },
+ "source": [
+ "## Get a Oasis Connection"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "kTLmib2BRTiY"
+ },
+ "source": [
+ "__Oasis__, the managed database service offering from ArangoDB, will be used for this exercise. This eliminates the need for setting up and configuring an instance of a database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "9voIoaGRS0cB"
+ },
+ "outputs": [],
+ "source": [
+ "from adbnx_adapter.imdb_arangoDB_networkx_adapter import IMDBArangoDB_Networkx_Adapter\n",
+ "import oasis\n",
+ "con = oasis.getTempCredentials()\n",
+ "\n",
+ "print()\n",
+ "print(\"https://{}:{}\".format(con[\"hostname\"], con[\"port\"]))\n",
+ "print(\"Username: \" + con[\"username\"])\n",
+ "print(\"Password: \" + con[\"password\"])\n",
+ "print(\"Database: \" + con[\"dbName\"])\n",
+ "\n",
+ "\n",
+ "ma = IMDBArangoDB_Networkx_Adapter(conn=con)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "APNXpksoRTif"
+ },
+ "source": [
+ "## Create the Collections for the Database "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "RQAoZi3AW9ru"
+ },
+ "outputs": [],
+ "source": [
+ "# Connect to the temp database\n",
+ "conn = oasis.connect(con)\n",
+ "db = conn[con[\"dbName\"]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "tDmWTWYkW6VW"
+ },
+ "outputs": [],
+ "source": [
+ "import csv\n",
+ "import json\n",
+ "import requests\n",
+ "import sys\n",
+ "import oasis\n",
+ "\n",
+ "\n",
+ "from pyArango.connection import *\n",
+ "from pyArango.collection import Collection, Edges, Field\n",
+ "from pyArango.graph import Graph, EdgeDefinition\n",
+ "from pyArango.collection import BulkOperation as BulkOperation\n",
+ "from pyArango.collection import Collection, Field\n",
+ "from pyArango.graph import Graph, EdgeDefinition\n",
+ "\n",
+ "\n",
+ "class Users(Collection):\n",
+ " _fields = {\n",
+ " \"user_id\": Field(),\n",
+ " \"age\": Field(),\n",
+ " \"gender\": Field()\n",
+ " }\n",
+ " \n",
+ "class Movies(Collection):\n",
+ " _fields = {\n",
+ " \"movie_id\": Field(),\n",
+ " \"movie_title\": Field(),\n",
+ " \"release_data\": Field()\n",
+ " }\n",
+ "\n",
+ "class Ratings(Edges): \n",
+ " _fields = {\n",
+ " #user_id and item_id are encoded by _from, _to \n",
+ " \"rating\": Field(),\n",
+ " \"timestamp\": Field()\n",
+ " }\n",
+ "\n",
+ "class IMDBGraph(Graph) :\n",
+ " _edgeDefinitions = [EdgeDefinition(\"Ratings\", fromCollections=[\"Users\"], toCollections=[\"Movies\"])]\n",
+ " _orphanedCollections = []\n",
+ "\n",
+ "db.createCollection(\"Users\")\n",
+ "db.createCollection(\"Movies\")\n",
+ "db.createCollection(\"Ratings\")\n",
+ "iMDBGraph = db.createGraph(\"IMDBGraph\")\n",
+ "\n",
+ "print(\"Collection/Graph Setup done.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "f0nSSiRnRTit"
+ },
+ "source": [
+ "## Import the Data "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "neURfa3jXPs5"
+ },
+ "outputs": [],
+ "source": [
+ "collection = db[\"Users\"]\n",
+ "with BulkOperation(collection, batchSize=100) as col:\n",
+ " with open('data/users.csv', newline='') as csvfile:\n",
+ " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
+ " # Skip header\n",
+ " next(reader)\n",
+ " for row in reader:\n",
+ " user_id, age, gender, occupation, zip = tuple(row)\n",
+ " doc = col.createDocument()\n",
+ " doc[\"_key\"] = user_id\n",
+ "# doc[\"age\"] = age\n",
+ "# doc[\"gender\"] = gender\n",
+ " doc.save()\n",
+ "\n",
+ "collection = db[\"Movies\"]\n",
+ "with BulkOperation(collection, batchSize=100) as col:\n",
+ " with open('data/movies.csv', newline='') as csvfile:\n",
+ " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
+ " # Skip header\n",
+ " next(reader)\n",
+ " for row in reader:\n",
+ " movie_id, movie_title, release_date, video_release_date, url, unknown, action, adventure, animation, childrens, comedy, crime, documentary, drama, fantasy, noir, horror, musical, mystery, romance, scifi, thriller, war, western = tuple(\n",
+ " row)\n",
+ " doc = col.createDocument()\n",
+ " doc[\"_key\"] = movie_id\n",
+ "# doc[\"movie_title\"] = movie_title\n",
+ "# doc[\"release_date\"] = release_date\n",
+ " doc.save()\n",
+ "\n",
+ "collection = db[\"Ratings\"]\n",
+ "with BulkOperation(collection, batchSize=1000) as col:\n",
+ " with open('data/ratings.csv', newline='') as csvfile:\n",
+ " reader = csv.reader(csvfile, delimiter=',', quotechar='|')\n",
+ " # Skip header\n",
+ " next(reader)\n",
+ " for row in reader:\n",
+ " user_id, movie_id, rating, timestamp = tuple(row)\n",
+ " doc = col.createDocument()\n",
+ " doc[\"_from\"] = \"Users/\"+user_id\n",
+ " doc[\"_to\"] = \"Movies/\"+movie_id\n",
+ " doc[\"ratings\"] = rating\n",
+ "# doc[\"timestamp\"] = timestamp\n",
+ " doc.save()\n",
+ "\n",
+ "print(\"Import Done\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Use the Networkx-Adapter to create a Networkx Graph"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have loaded the data into ArangoDB, we can use the __Networkx-Adapter__ to create _Networkx_ graphs from the ArangoDB graph. To do so, we'll have to provide a graph descriptor that describes the graph structure to the __Networkx-Adapter__. These steps are illustrated below. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "tua4CIHYRTi0"
+ },
+ "source": [
+ "## Specify the Graph Structure "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "9oKkkRxwRTi0"
+ },
+ "source": [
+ "To use the IMDB Networkx Adapter, we need to specify the structure of the graph that we want to create. This is done with a simple dictionary. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "wobwe8KqXXi2"
+ },
+ "outputs": [],
+ "source": [
+ "imdb_attributes = {'vertexCollections': {'Users': {},\n",
+ " 'Movies': {}},\n",
+ " 'edgeCollections': {'Ratings': {'_from', '_to', 'ratings'}}}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Obtain the networkx graph"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "jjJZJEgvXZ0W"
+ },
+ "outputs": [],
+ "source": [
+ "g = ma.create_networkx_graph(\n",
+ " graph_name='IMDBGraph', graph_attributes=imdb_attributes)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "__Done!__, we now have a _Networkx_ graph representation that we can use for analysis!"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Analysis of the IMDB reviews dataset with Networkx\n",
+ "We just created a networkx graph for this problem. In this notebook we will use a small set of ideas from the standard body of ideas used in network analysis to extract insights from the movie review dataset. Networkx provides an implementation of these ideas. These will be discussed next.\n",
+ "\n",
+ "As mentioned earlier, one of the advantages of using a graph representation is that we can leverage the standard body of ideas used to analyze networks to extract information about this dataset. In what follows, we will call out such facts as we identify them.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Get the user and movie nodes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_nodes = [n for n in g.nodes() if n.startswith(\"Users\")]\n",
+ "movie_nodes = [n for n in g.nodes() if n.startswith(\"Movies\")]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Structural Property Introspection: Number of Nodes and Edges"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"Number of Users are %d\" % (len(user_nodes)))\n",
+ "print(\"Number of Movies are %d\" % (len(movie_nodes)))\n",
+ "print(\"Number of Ratings are %d\" % (len(list(g.edges()))))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Convert the graph obtained from the interface to a bi-partite graph "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import networkx as nx\n",
+ "B = nx.Graph()\n",
+ "B.add_nodes_from(user_nodes, bipartite=0)\n",
+ "B.add_nodes_from(movie_nodes, bipartite=1)\n",
+ "B.add_edges_from(list(g.edges()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_The graph has two kinds of edges. It has 943 users and and 1682 Movies. A user may watch multiple movies. 65499 movie ratings are available. This information is obtained by simply identifying the structure of the graph._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from networkx.algorithms import bipartite\n",
+ "bipartite.is_bipartite(B)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Is the graph connected?\n",
+ "If the graph is connected then there is a path between any two nodes in the graph. If not, then some nodes are not connected. In the context of this application, a disconnected graph implies that there exist users in the graph who may not have rated a movie in common. If it is true that any given pair of users have rated at least one movie in common, then there is a path connecting these users through the movie they have rated in common."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Identified fact:\n",
+ "_In the user community, if we pick any two users, then it is possible that they may not have rated a movie in common._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nx.is_connected(B)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "users = [n for n in B.nodes if B.nodes[n]['bipartite'] == 0]\n",
+ "movies = [n for n in B.nodes if B.nodes[n]['bipartite'] == 1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many movies does a User rate?\n",
+ "This is a distribution. A review of the plot below will show the familiar long tailed distribution. Most viewers rate less than say 100 movies. There are some serious reviewers out there though. These are on the tail of the distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "degu, degm = bipartite.degrees(B, movies)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "du = [v for k,v in degu]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "plt.rcParams['figure.figsize'] = [10, 5]\n",
+ "ax = plt.hist(du)\n",
+ "plt.title(\"Histogram of the Number of Movies Viewed\")\n",
+ "plt.xlabel(\"Movies Viewed\")\n",
+ "plt.grid(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_Inspecting the degree distribution of vertices is an activity we commonly perform to understand some basic characteristics of the network. In this application, this activity shows that most user's rate less than a 100 movies. The graph above is an example of \"long tailed distribution\". This is commonly seen in social networks._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How many users rate a movie?\n",
+ "This is also a distribution that also has the long tailed behavior. Most mvoies are rated by less than 100 users. There are some movies that are rated by many viewers though."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dm = [v for k,v in degm]\n",
+ "plt.rcParams['figure.figsize'] = [10, 5]\n",
+ "ax = plt.hist(dm)\n",
+ "plt.title(\"Histogram of the Number of Viewers Rating a Movie\")\n",
+ "plt.xlabel(\"Number of Viewers\")\n",
+ "plt.grid(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_We can repeat the exploration of degree distribution with the movie nodes. The behavior observed with the movie nodes is similar to what is observed with the user nodes. We observe the same \"long tailed\" distribution and most movies are rated by less than a 100 users._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Centrality Measures\n",
+ "\n",
+ "A centrality measure identifies influential nodes in a network. How do we formalize the observations about movie importance and the user's rating behavior in the given network? The notion of centrality measure is useful in this regard. Many measures of node centrality are used, see [Chapter 3 of the text book by Zafarani et.al](http://dmml.asu.edu/smm/) for details. We will use degree centraility in this example (see the [documentation](https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.centrality.degree_centrality.html) for details). In the illustrations below, we will capture the top 10 users and movies in terms of importance as determined by this measure. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dc= bipartite.degree_centrality(B, users)\n",
+ "sdcu = {}\n",
+ "sdcm = {}\n",
+ "for k, v in sorted(dc.items(),reverse=True, key=lambda item: item[1]):\n",
+ " if k.startswith(\"Users\"):\n",
+ " sdcu[k] = v\n",
+ " else:\n",
+ " sdcm[k] = v\n",
+ "\n",
+ "del dc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## List the top 10 users in terms of degree centrality\n",
+ "These guys rate a lot of movies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "list(sdcu.keys())[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## List the top 10 movies in terms of degree centrality\n",
+ "These movies are rated by a lot of people"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list(sdcm.keys())[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How does this importance measure vary over the user community?\n",
+ "We can get a sense of this by viewing the distribution of this measure."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "npvals = np.fromiter(sdcu.values(), dtype=float)\n",
+ "sns.set(rc={'figure.figsize':(11.7,8.27)})\n",
+ "ax = sns.distplot(npvals, hist = False)\n",
+ "plt.title(\"Distribution of User Degree Centrality\")\n",
+ "plt.xlabel(\"User Degree Centraility\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How does this importance measure vary over the movie community?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "npvals = np.fromiter(sdcm.values(), dtype=float)\n",
+ "sns.set(rc={'figure.figsize':(11.7,8.27)})\n",
+ "ax = sns.distplot(npvals, hist = False)\n",
+ "plt.title(\"Distribution of Movie Degree Centrality\")\n",
+ "plt.xlabel(\"Movie Degree Centraility\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_We have a measure to capture the importance of a movie and a user on the basis of the rating activity performed on them. We have identified important users and movies in the network on the basis of this measure. We have also have information about how this importance measure varies over the users and movies in the data._"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "x1vtgYbAXpec"
+ },
+ "outputs": [],
+ "source": [
+ "m4v = [t[0] for t in g.in_edges('Movies/4')]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Similarity of nodes\n",
+ "Just like measures exist to quantify the importance of nodes in a network, there exist measures to quantify the similarity of nodes in a network. We will pick a random node in the network and characterize the similarity of nodes connected to it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "y4W1MtL-RTjK"
+ },
+ "source": [
+ "## How similar are viewers of the movie 'Get Shorty'?\n",
+ "In this example, we will use the __Jaccard__ similarity as a measure of node similarity. We first get all pairs of users who have seen the movie and then compute the __Jaccard__ similarity between them. The details are shown below. Networkx provides an API for this purpose that we will use."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "z4WoMVchXtVq"
+ },
+ "outputs": [],
+ "source": [
+ "from itertools import combinations\n",
+ "m4vucmb = list(combinations(m4v, 2))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "3Z4VBUvkXyHl"
+ },
+ "outputs": [],
+ "source": [
+ "import networkx as nx\n",
+ "gp = g.to_undirected()\n",
+ "jcp = nx.jaccard_coefficient(gp, m4vucmb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "f2i7Puh9X14N"
+ },
+ "outputs": [],
+ "source": [
+ "jcpv = []\n",
+ "for u, v, p in jcp:\n",
+ " jcpv.append(p)\n",
+ " #print('(%s, %s) -> %.8f' % (u, v, p))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## What does the distribution of User Similarity look like?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "sns.set(rc={'figure.figsize':(11.7,8.27)})\n",
+ "ax = sns.distplot(jcpv, hist = False)\n",
+ "plt.title(\"Distribution of Jaccard Similarity between Raters of Get Shorty\")\n",
+ "plt.xlabel(\"Jaccard Similarity\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_Picking a random node in the graph, we characterized the similarity of nodes connected to it. In this dataset, we can use this idea to get a sense of how similar user's rating a movie tend to be. A review of the above figure shows that a range of similarities are observed._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Bipartite Clustering\n",
+ "The next idea that we will investigate on the graph comes from [Latapy et.al](https://arxiv.org/abs/cond-mat/0611631). In this work, the authors extend the notion of clustering coefficients associated with a node to the case of bi-partite graphs. Since our graph is a bipartite graph, we will use this metric. See section VIII of the paper for a detailed explanation of the idea. Briefly, the clustering coefficient for a pair of nodes $u, v$, is defined as follows:\n",
+ "$$\n",
+ "\\begin{equation*}\n",
+ "cc(u,v) = \\frac{N(u)\\cap N(v)}{N(u) \\cup N(v)},\n",
+ "\\end{equation*}\n",
+ "$$\n",
+ "where:\n",
+ "\n",
+ "* N(u) is the neighborhood of node $u$. These are the nodes connected to $u$. For example, the movies user $u$ rates.\n",
+ "* N(v) is the neighborhood of node $v$.\n",
+ "Reviewing the above equation, it is evident that the clustering coefficient captures the shared neighborhood for a pair of nodes. Nodes associated with high clustering coefficient have many common neighbors. In the context of this problem, a pair of users with a high clustering coefficient suggests that these users have rated many movies in common. Similarly, a pair of movies with a high clustering coefficient suggests that these movies there are many common raters(users) for these movies. Networkx provides an implementation of this idea. The details of computing the clustering coefficients for the users and movies are shown below.\n",
+ "To obtain the clustering coefficient for a node $u$, we simply average the clustering coefficient of $u$ with other nodes that it shares common neighbors with. It is defined as follows:\n",
+ "\n",
+ "$$\n",
+ "\\begin{equation*}\n",
+ "cc(u) = \\frac{\\sum_{v \\in N(u)} cc(u,v)}{\\left| N(N(u) \\right|},\n",
+ "\\end{equation*}\n",
+ "$$\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cr = bipartite.clustering(B)\n",
+ "cu = {}\n",
+ "cm = {}\n",
+ "for k, v in sorted(cr.items(),reverse=True, key=lambda item: item[1]):\n",
+ " if k.startswith(\"Users\"):\n",
+ " cu[k] = v\n",
+ " else:\n",
+ " cm[k] = v\n",
+ "\n",
+ "del cr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## List the top 10 users in terms of clustering tendencies \n",
+ "These users share a high degree of common movie rating activity with other users. If rating is used as a proxy for the act of liking or disliking a movie, then user's with high clustering values share rating activity with "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list(cu.keys())[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## List the top 10 movies in terms of clustering tendencies \n",
+ "These movies have a high degree of common users rating them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list(cm.keys())[:10]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How does user clustering tendency vary over the community?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "npvals = np.fromiter(cu.values(), dtype=float)\n",
+ "sns.set(rc={'figure.figsize':(11.7,8.27)})\n",
+ "ax = sns.distplot(npvals, hist = False)\n",
+ "plt.title(\"Distribution of User Clustering Coefficients\")\n",
+ "plt.xlabel(\"User Clustering Coefficients\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Note:\n",
+ "Note that the distribution of user clustering coefficients has two humps (bi-modal). This suggests that there are two user communities. This is borne out when we visualize a sample of the user community using a _tSNE_ plot later in this notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## How does movie clustering tendency vary over the community?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "npvals = np.fromiter(cm.values(), dtype=float)\n",
+ "sns.set(rc={'figure.figsize':(11.7,8.27)})\n",
+ "ax = sns.distplot(npvals, hist = False)\n",
+ "plt.title(\"Distribution of Movie Clustering Coefficients\")\n",
+ "plt.xlabel(\"Movie Clustering Coefficients\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Note:\n",
+ "Note that the distribution of user clustering coefficients has three humps. This suggests that there are three movie clusters. \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Identified fact:\n",
+ "_We have identified users and movies that tend to have have a lot of common rating activity. We have identified that users and movies have clustering tendencies._"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Using Node2vec to obtain a graph embedding"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Machine learning techniques can be applied on graphs to determine vector representations of graph elements such as nodes and edges. Such a representation is called an _embedding_. We can use the Networkx representation of the graph associated with the 10 most clustered users and determine its embedding. We can use a technique called [_t Stochastic Neighbor Embedding_](https://www.youtube.com/watch?v=RJVL80Gg3lA&list=UUtXKDgv1AVoG88PLl8nGXmw) to plot a two dimensional representation of this subgraph. The details of doing this is provided below. Earlier we noted that the user clustering tendencies were bi-modal. A review of the embedding of the sub-graph associated with the 10 most clustered users shows that there are two clusters in the data."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Extract the sub-graph associated with the 10 most clustered users"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t10cu = list(cu.keys())[:10]\n",
+ "proj_user = nx.bipartite.projected_graph(B, t10cu)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(list(proj_user.edges()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "len(proj_user.nodes())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text",
+ "id": "Jv5XKBLfRTjd"
+ },
+ "source": [
+ "## Embed the sub-graph using Node2vec "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "e-fD-eNhX9Lh"
+ },
+ "outputs": [],
+ "source": [
+ "from node2vec import Node2Vec\n",
+ "node2vec = Node2Vec(proj_user, dimensions=64, walk_length=100, num_walks=100, workers=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "6hu6Z0VPYDFK"
+ },
+ "outputs": [],
+ "source": [
+ "model = node2vec.fit(window=10, min_count=1, batch_words=4)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t10cu[2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Apply the model\n",
+ "We can ask questions like \"Who is most similar to (a particular user) Users/64?\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {},
+ "colab_type": "code",
+ "id": "41YVYl3jYHia"
+ },
+ "outputs": [],
+ "source": [
+ "model.wv.most_similar(t10cu[2])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Embed the graph with tSNE"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "t10cu_emb = { n: list(model.wv.get_vector(n)) for n in proj_user.nodes()}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "from matplotlib import cm\n",
+ "from sklearn.manifold import TSNE\n",
+ "fig = plt.figure(figsize=(11.7,8.27))\n",
+ "plt.grid(True)\n",
+ "plt.xlabel(\"Comp 1\")\n",
+ "plt.ylabel(\"Comp 2\")\n",
+ "plt.title(\"tSNE plot of subgraph of top 10 most clustered users\")\n",
+ "t10cu_embedded = TSNE(n_components=2).fit_transform(list(t10cu_emb.values()))\n",
+ "sns.scatterplot(t10cu_embedded[:,0], t10cu_embedded[:,1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "__Using the Networkx adapter we have been able to perform a range of analysis tasks using the Networkx API. We were also able use the Networkx graph to obtain an embedding of a graph using Node2vec.__"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Store Embeddings in ArangoDB"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%time\n",
+ "collection = db[\"Users\"]\n",
+ "with BulkOperation(collection, batchSize=500) as col:\n",
+ " #user_col = db.collections[\"Users\"]\n",
+ " for u, e in t10cu_emb.items():\n",
+ " the_key = u.split('/')[1]\n",
+ " the_user = collection[the_key]\n",
+ " the_user[\"n2v_emb\"] = e\n",
+ " the_user.patch()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Store metadata about these experiments using Arangopipe"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from arangopipe.arangopipe_storage.arangopipe_api import ArangoPipe\n",
+ "from arangopipe.arangopipe_storage.arangopipe_admin_api import ArangoPipeAdmin\n",
+ "from arangopipe.arangopipe_storage.arangopipe_config import ArangoPipeConfig\n",
+ "from arangopipe.arangopipe_storage.managed_service_conn_parameters import ManagedServiceConnParam\n",
+ "mdb_config = ArangoPipeConfig()\n",
+ "msc = ManagedServiceConnParam()\n",
+ "conn_params = { msc.DB_SERVICE_HOST : \"arangoml.arangodb.cloud\", \\\n",
+ " msc.DB_SERVICE_END_POINT : \"createDB\",\\\n",
+ " msc.DB_SERVICE_NAME : \"createDB\",\\\n",
+ " msc.DB_SERVICE_PORT : 8529,\\\n",
+ " msc.DB_CONN_PROTOCOL : 'https'}\n",
+ " \n",
+ "mdb_config = mdb_config.create_connection_config(conn_params)\n",
+ "admin = ArangoPipeAdmin(reuse_connection = False, config = mdb_config)\n",
+ "ap_config = admin.get_config()\n",
+ "ap = ArangoPipe(config = ap_config)\n",
+ "proj_info = {\"name\": \"IMDB_Movie_Reviews\"}\n",
+ "proj_reg = admin.register_project(proj_info)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import io\n",
+ "import requests\n",
+ "url = ('https://raw.githubusercontent.com/arangoml/networkx-adapter/master/examples/IMDB_Networkx_Adapter.ipynb')\n",
+ "nbjson = requests.get(url).text\n",
+ "model_info = {\"name\": \"Exploratory Data Analysis IMDB\", \"task\": \"multiple\", 'notebook': nbjson}\n",
+ "model_reg = ap.register_model(model_info, project = \"IMDB_Movie_Reviews\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Summary"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* The Networkx-Adapter makes it easy to obtain Networkx graphs from ArangoDB graphs.\n",
+ "* We have demonstrated that using a graph representation helped us identify facts about the data. These facts were identified by leveraging standard ideas from Network Theory.\n",
+ "* We have leveraged the Networkx graph to obtain a _Node2vec_ embedding of the graph associated with our data. For an example of how the Networkx-Adapter can be used with a _deep learning_ library, please view the [ITSM ArangoDB Adapter](https://github.com/arangoml/networkx-adapter/blob/master/examples/ITSM_ArangoDB_Adapter.ipynb). "
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "include_colab_link": true,
+ "name": "IMDB_Networkx_Adapter.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/examples/__pycache__/oasis.cpython-37.pyc b/examples/__pycache__/oasis.cpython-37.pyc
index 4d2ee973..ecc57197 100644
Binary files a/examples/__pycache__/oasis.cpython-37.pyc and b/examples/__pycache__/oasis.cpython-37.pyc differ
diff --git a/examples/creds.dat b/examples/creds.dat
index dfd3bf60..c6a4bc6d 100644
Binary files a/examples/creds.dat and b/examples/creds.dat differ