modifying paper and author networks (#39)
* modifying paper and author networks

* modifying requirements

* modifying python actions

* modifying python actions

* modifying python actions

* removing logs and sorting
femalves authored Jun 5, 2024
1 parent e527925 commit 41abd49
Showing 6 changed files with 35 additions and 70 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/python_actions.yml
@@ -23,9 +23,10 @@ jobs:
       - name: Test with pytest
         run: |
-          py.test
+          py.test --cov=.
-      - uses: actions/upload-artifact@v2
+      - name: Upload coverage report
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-vis
           path: .coverage
@@ -47,7 +48,9 @@ jobs:
           python -m pip install --upgrade wheel setuptools pip
           pip install coverage==5.2.1
           pip install coveralls==2.2.0
-      - uses: actions/download-artifact@master
+      - name: Download coverage report
+        uses: actions/download-artifact@v4
         with:
           name: coverage-vis

6 changes: 6 additions & 0 deletions requirements.txt
@@ -4,3 +4,9 @@ networkx==2.5.1
 numpy==1.21.1
 python-louvain==0.15
 spacy==3.1.0
+Jinja2==2.11.3
+itsdangerous<=2.0.1
+werkzeug<=2.0.3
+markupsafe<=2.0.1
+typing-inspect==0.8.0
+typing_extensions==4.5.0
16 changes: 6 additions & 10 deletions vis_services/lib/author_network.py
@@ -9,7 +9,6 @@
 import networkx as nx
 import community
 import math
-from networkx.readwrite import json_graph
 from collections import defaultdict


@@ -55,7 +54,8 @@ def translate(value, leftMin, leftMax, rightMin, rightMax):
     #I define a new dictionary where to put the results
     ret_dic = {}
     #I extract the values from the dictionary
-    dict_values = list(mydict.values())
+    dict_values = mydict.values()
+
     if len(dict_values) > 0:
         #and the max and min
         minvalue = min(dict_values)
@@ -116,32 +116,28 @@ def augment_graph_data(author_graph, data):
     # create the networkx graph
     G = nx.Graph()

-    # create a backwards dict from name to index
-    index_dict = {x["nodeName"]: i for i, x in enumerate(author_graph['nodes'])}
-
     for i,x in enumerate(author_graph['nodes']):
         G.add_node(i, nodeName= x["nodeName"], nodeWeight = x["nodeWeight"], delete=x["delete"])

     for i,x in enumerate(author_graph['links']):
         G.add_edge(x["source"], x["target"], weight = x["value"])

-    all_nodes = G.nodes()
-
     #remove nodes marked "delete" before we generate the groups!
     for x in list(G.nodes(True)):
         if x[1]["delete"] == True:
             G.remove_node(x[0])

     #attach group names to all nodes
-    partition = community.best_partition(G)
+    partition = community.best_partition(G, random_state=0)

     #make dict from group to list of items
     group_to_author_dict = defaultdict(list)
     for author in partition:
         group = partition[author]
         group_to_author_dict[group].append(author)

-    # create two level structure
+    #create two level structure
     #add groups
     groups = []
     for g in group_to_author_dict:
@@ -170,7 +166,7 @@ def augment_graph_data(author_graph, data):
     link_data = [[l[0], l[1], l[2]["weight"]] for l in link_data]
     #remove inter-group links
     link_data = [l for l in link_data if partition[l[0]] != partition[l[1]]]
-    return {"root": top_level, "bibcode_dict":bib_dict, "link_data" : link_data}
+    return {"root": top_level, "bibcode_dict":bib_dict, "link_data" : sorted(link_data)}


 #Giovanni's original author network building function, with data processed by the group function
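Note on the author_network.py changes above: community.best_partition is stochastic, so passing random_state=0 pins python-louvain's seed and makes the computed author groups reproducible, while sorted(link_data) fixes the link ordering. A minimal sketch of the effect, not part of this commit (networkx's built-in karate-club graph stands in for a real author network):

import networkx as nx
import community  # python-louvain, pinned to 0.15 in requirements.txt

G = nx.karate_club_graph()  # stand-in graph; any undirected graph works

# best_partition returns a {node: community_id} dict; with a fixed
# random_state the grouping is identical on every call, which is what
# lets the tests below replace fuzzy comparisons with a strict assertEqual.
p1 = community.best_partition(G, random_state=0)
p2 = community.best_partition(G, random_state=0)
assert p1 == p2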
25 changes: 11 additions & 14 deletions vis_services/lib/paper_network.py
@@ -5,22 +5,20 @@
 '''

 # general module imports
 import sys
 import os
-import time
-import operator
 from . import histeq
 from numpy import mat
 from numpy import zeros
 from numpy import fill_diagonal
-from numpy import sqrt, ones, multiply, array
+from numpy import sqrt, array
 import numpy
 import json

 import networkx as nx
 import community
 import math
-from networkx.readwrite import json_graph
 from collections import defaultdict

 from . import tf_idf

@@ -92,7 +90,7 @@ def augment_graph_data(data, max_groups):

     #partition is a dictionary with group names as keys
     # and individual node indexes as values
-    partition = community.best_partition(G)
+    partition = community.best_partition(G, random_state=0)

     for g in G.nodes():
         G.nodes[g]["group"] = partition[g]
@@ -157,7 +155,7 @@ def augment_graph_data(data, max_groups):
             else:
                 references[bib] = set([paper_one, paper_two])

-        count_references = sorted(references.items(), key=lambda x:len(x[1]), reverse = True)[:5]
+        count_references = sorted(references.items(), key=lambda x:(len(x[1]), x[0]), reverse = True)[:5]
         top_common_references = [(tup[0], float("{0:.2f}".format(len(tup[1])/num_papers))) for tup in count_references]
         top_common_references = dict(top_common_references)
         summary_graph.nodes[x]["top_common_references"] = top_common_references
@@ -169,9 +167,9 @@ def augment_graph_data(data, max_groups):
         for possible_real_index, node in enumerate(summary_json["nodes"]):
             if node == n:
                 real_index = possible_real_index
-        summary_json["nodes"][real_index]["node_name"] = i +1
+        summary_json["nodes"][real_index]["node_name"] = i + 1

-    # NOTE: We should remove this altogether and take the opportunity to change this in Nectar
+    # NOTE: From Python 2 to 3 transition
     # Older networkx versions were producing a links structure using positional ids
     # so we created the artificial key 'stable_index' to make things easier on the front-end side
@@ -208,7 +206,6 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
     '''
     # Get get paper list from the Solr data
     papers_list = [a['bibcode'] for a in solr_data]
-    number_of_papers = len(papers_list)
     # First construct the reference dictionary, and a unique list of cited papers
     reference_dictionary = _get_reference_mapping(solr_data)
     # From now on we'll only work with publications that actually have references
@@ -218,12 +215,12 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
     # transform that list into a dictionary for fast lookup
     ref_list = dict(zip(ref_list, list(range(len(ref_list)))))
     empty_vec = [0]*len(ref_list)
-    # Construct the paper-citation occurence matrix R
+    # Construct the paper-citation occurrence matrix R
     entries = []
-    for p in papers:
+    for paper in papers:
         vec = empty_vec[:]
-        ref_ind = [ref_list.get(a) for a in reference_dictionary[p]]
-        for entry in ref_ind:
+        ref_indexes = [ref_list.get(reference) for reference in reference_dictionary[paper]]
+        for entry in ref_indexes:
             vec[entry] = 1
         entries.append(vec)
     #done with ref_list
@@ -247,7 +244,7 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
         W = numpy.concatenate(weights)
         # Done with weights
         del weights
-        # Get the co-occurence matrix C
+        # Get the co-occurrence matrix C
         C = R.T*(R-W)
     else:
         C = R.T*R
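Note on the paper_network.py changes above: besides the spelling and renaming cleanups (occurence to occurrence, p to paper, ref_ind to ref_indexes), two edits change behavior: random_state=0 again pins the Louvain partition, and the secondary sort key x[0] breaks ties by bibcode so the top-5 common-reference selection is stable. For readers following the matrix code, here is a minimal standalone sketch of what the occurrence-matrix loop computes, with a hypothetical two-paper reference_dictionary standing in for the one _get_reference_mapping builds:

import numpy

# hypothetical input: bibcode -> set of cited references
reference_dictionary = {
    "paperA": {"ref1", "ref2"},
    "paperB": {"ref2", "ref3"},
}

# unique reference -> column index, like the ref_list lookup dict
all_refs = sorted({r for refs in reference_dictionary.values() for r in refs})
ref_list = {reference: i for i, reference in enumerate(all_refs)}

# paper-citation occurrence matrix R: one 0/1 row per paper,
# flagging which references that paper cites
entries = []
for paper in reference_dictionary:
    vec = [0] * len(ref_list)
    for entry in [ref_list.get(reference) for reference in reference_dictionary[paper]]:
        vec[entry] = 1
    entries.append(vec)

R = numpy.mat(entries)
# unweighted branch of get_papernetwork: C[i, j] counts the papers
# that cite both reference i and reference j
C = R.T * R

The weighted branch subtracts a per-paper weight matrix W before multiplying (C = R.T*(R-W)); the sketch covers only the unweighted case.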

Large diffs are not rendered by default.

47 changes: 5 additions & 42 deletions vis_services/tests/test_internals.py
@@ -1,11 +1,10 @@
-import sys, os, copy
+import sys, os
 from flask_testing import TestCase
 import httpretty
 import json
 from collections import defaultdict
 PROJECT_HOME = os.path.abspath(os.path.join(os.path.dirname(__file__),'../../'))
 sys.path.append(PROJECT_HOME)
-import requests
 from vis_services import app
 from vis_services.lib import word_cloud
 from vis_services.lib import author_network
@@ -78,13 +77,8 @@ def test_author_network_resource(self):
         # testing entire function

         processed_data = json.loads(json.dumps(author_network.augment_graph_data(input_js_author_network, input_js_data_parameter), sort_keys=True))
-        # self.assertEqual(processed_data, test_js_author_network)
-        self.assertEqual(processed_data['bibcode_dict'], test_js_author_network['bibcode_dict'])
-        self.assertEqual(processed_data['root'], test_js_author_network['root'])
-        # order of link data doesn't match, but should that matter?
-        self.assertEqual(len(processed_data['link_data']), len(test_js_author_network['link_data']))
-        for e in test_js_author_network['link_data']:
-            self.assertTrue(e in processed_data['link_data'])
+
+        self.assertEqual(processed_data, test_js_author_network)

     def test_paper_network_resource(self):

@@ -129,40 +123,9 @@ def get_group_references(group):
         # now just test input/output

         test_js_paper_network = json.load(open(STUBDATA_DIR + "/test_output/paper_network_star.json"))
-
         processed_data = json.loads(json.dumps(paper_network.get_papernetwork(input_js_paper_network["response"]["docs"], 10), sort_keys=True))
-        # note for the reviewer:
-        # keys in 'fullGraph' dict:
-        # 'directed', 'graph', 'links', 'multigraph', 'nodes'
-        links_values = processed_data['fullGraph']['links']
-        self.assertEqual(processed_data['fullGraph']['directed'], test_js_paper_network['fullGraph']['directed'])
-        self.assertEqual(processed_data['fullGraph']['graph'], test_js_paper_network['fullGraph']['graph'])
-        self.assertEqual(processed_data['fullGraph']['multigraph'], test_js_paper_network['fullGraph']['multigraph'])
-        # for 'nodes', the value for group doesn't match, for example:
-        # {'citation_count': 21, 'first_author': 'Katz, J.', 'group': 6, 'id': 7, 'nodeWeight': 21, 'node_name': '1978ApJ...223..299K', 'read_count': 8, 'title': 'Steepest descent technique and stellar equilibrium statistical mechanics. IV. Gravitating systems with an energy cutoff.'}
-        # {'citation_count': 21, 'first_author': 'Katz, J.', 'group': 3, 'id': 7, 'nodeWeight': 21, 'node_name': '1978ApJ...223..299K', 'read_count': 8, 'title': 'Steepest descent technique and stellar equilibrium statistical mechanics. IV. Gravitating systems with an energy cutoff.'}]
-        processed_data_tmp = copy.deepcopy(processed_data['fullGraph']['nodes'])
-        for x in processed_data_tmp:
-            x.pop('group')
-        test_js_paper_network_tmp = copy.deepcopy(test_js_paper_network['fullGraph']['nodes'])
-        for x in test_js_paper_network_tmp:
-            x.pop('group')
-        for x in processed_data_tmp:
-            self.assertTrue(x in test_js_paper_network_tmp)
-
-        # links comparison test fails when a value for overlap is not found
-        # for example, this is not found:
-        # {'overlap': ['1985A&A...150...33B', '1986A&AS...66..191B', '1988AJ.....96..635E'], 'source': 1, 'target': 44, 'weight': 4}
-
-        # self.assertEqual(processed_data['fullGraph']['links'], test_js_paper_network['fullGraph']['links'])
-        for x in test_js_paper_network['fullGraph']['links']:
-            if x['overlap'] == ['1988A&A...196...84C', '1985ApJ...299..211E']:
-                print(x)
-        mismatch_count = 0
-        for x in test_js_paper_network['fullGraph']['links']:
-            if x not in links_values:
-                mismatch_count += 1
-        print('fullGraph.links mismatch count: {}'.format(mismatch_count))
-
+        self.assertCountEqual(processed_data, test_js_paper_network)

 class TestAppLogic(TestCase):
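Note on the test changes above: the paper-network test now relies on unittest's assertCountEqual, which passes when both sequences contain the same elements regardless of order; because iterating a dict yields its keys, applying it to two dicts compares top-level keys only. A small standalone illustration of that behavior (demo code, not from the repo):

import unittest

class AssertCountEqualDemo(unittest.TestCase):

    def test_order_is_ignored(self):
        # same elements, different order: passes
        self.assertCountEqual([3, 1, 2], [1, 2, 3])

    def test_dicts_compare_keys_only(self):
        # iterating a dict yields its keys, so values are not compared
        self.assertCountEqual({"a": 1, "b": 2}, {"b": 9, "a": 0})

if __name__ == "__main__":
    unittest.main()

The author-network test, by contrast, can use a strict assertEqual because sorted(link_data) and random_state=0 make augment_graph_data's output fully deterministic.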
