modifying paper and author networks (#39)
* modifying paper and author networks

* modifying requirements

* modifying python actions

* modifying python actions

* modifying python actions

* removing logs and sorting
femalves authored Jun 5, 2024
1 parent e527925 commit 41abd49
Showing 6 changed files with 35 additions and 70 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/python_actions.yml
@@ -23,9 +23,10 @@ jobs:
       - name: Test with pytest
         run: |
-          py.test
+          py.test --cov=.
-      - uses: actions/upload-artifact@v2
+      - name: Upload coverage report
+        uses: actions/upload-artifact@v4
         with:
           name: coverage-vis
           path: .coverage
@@ -47,7 +48,9 @@ jobs:
           python -m pip install --upgrade wheel setuptools pip
           pip install coverage==5.2.1
           pip install coveralls==2.2.0
-      - uses: actions/download-artifact@master
+      - name: Download coverage report
+        uses: actions/download-artifact@v4
         with:
           name: coverage-vis

6 changes: 6 additions & 0 deletions requirements.txt
@@ -4,3 +4,9 @@ networkx==2.5.1
 numpy==1.21.1
 python-louvain==0.15
 spacy==3.1.0
+Jinja2==2.11.3
+itsdangerous<=2.0.1
+werkzeug<=2.0.3
+markupsafe<=2.0.1
+typing-inspect==0.8.0
+typing_extensions==4.5.0
16 changes: 6 additions & 10 deletions vis_services/lib/author_network.py
@@ -9,7 +9,6 @@
 import networkx as nx
 import community
 import math
-from networkx.readwrite import json_graph
 from collections import defaultdict


@@ -55,7 +54,8 @@ def translate(value, leftMin, leftMax, rightMin, rightMax):
     #I define a new dictionary where to put the results
     ret_dic = {}
     #I extract the values from the dictionary
-    dict_values = list(mydict.values())
+    dict_values = mydict.values()
+
     if len(dict_values) > 0:
         #and the max and min
         minvalue = min(dict_values)
@@ -116,32 +116,28 @@ def augment_graph_data(author_graph, data):
     # create the networkx graph
     G = nx.Graph()

-    # create a backwards dict from name to index
-    index_dict = {x["nodeName"]: i for i, x in enumerate(author_graph['nodes'])}
-
     for i,x in enumerate(author_graph['nodes']):
         G.add_node(i, nodeName= x["nodeName"], nodeWeight = x["nodeWeight"], delete=x["delete"])

     for i,x in enumerate(author_graph['links']):
         G.add_edge(x["source"], x["target"], weight = x["value"])

-    all_nodes = G.nodes()
-
     #remove nodes marked "delete" before we generate the groups!
     for x in list(G.nodes(True)):
         if x[1]["delete"] == True:
             G.remove_node(x[0])

     #attach group names to all nodes
-    partition = community.best_partition(G)
+    partition = community.best_partition(G, random_state=0)

     #make dict from group to list of items
     group_to_author_dict = defaultdict(list)
     for author in partition:
         group = partition[author]
         group_to_author_dict[group].append(author)

-    # create two level structure
+    #create two level structure
     #add groups
     groups = []
     for g in group_to_author_dict:
@@ -170,7 +166,7 @@ def augment_graph_data(author_graph, data):
     link_data = [[l[0], l[1], l[2]["weight"]] for l in link_data]
     #remove inter-group links
     link_data = [l for l in link_data if partition[l[0]] != partition[l[1]]]
-    return {"root": top_level, "bibcode_dict":bib_dict, "link_data" : link_data}
+    return {"root": top_level, "bibcode_dict":bib_dict, "link_data" : sorted(link_data)}


 #Giovanni's original author network building function, with data processed by the group function
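Note on the author_network.py changes above: community.best_partition is stochastic, so passing random_state=0 pins python-louvain's seed and makes the computed author groups reproducible, while sorted(link_data) fixes the link ordering. A minimal sketch of the effect, not part of this commit (networkx's built-in karate-club graph stands in for a real author network):

import networkx as nx
import community  # python-louvain, pinned to 0.15 in requirements.txt

G = nx.karate_club_graph()  # stand-in graph; any undirected graph works

# best_partition returns a {node: community_id} dict; with a fixed
# random_state the grouping is identical on every call, which is what
# lets the tests below replace fuzzy comparisons with a strict assertEqual.
p1 = community.best_partition(G, random_state=0)
p2 = community.best_partition(G, random_state=0)
assert p1 == p2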
25 changes: 11 additions & 14 deletions vis_services/lib/paper_network.py
@@ -5,22 +5,20 @@
 '''

 # general module imports
 import sys
 import os
-import time
-import operator
 from . import histeq
 from numpy import mat
 from numpy import zeros
 from numpy import fill_diagonal
-from numpy import sqrt, ones, multiply, array
+from numpy import sqrt, array
 import numpy
 import json

 import networkx as nx
 import community
 import math
-from networkx.readwrite import json_graph
 from collections import defaultdict

 from . import tf_idf

@@ -92,7 +90,7 @@ def augment_graph_data(data, max_groups):

     #partition is a dictionary with group names as keys
     # and individual node indexes as values
-    partition = community.best_partition(G)
+    partition = community.best_partition(G, random_state=0)

     for g in G.nodes():
         G.nodes[g]["group"] = partition[g]
@@ -157,7 +155,7 @@ def augment_graph_data(data, max_groups):
             else:
                 references[bib] = set([paper_one, paper_two])

-        count_references = sorted(references.items(), key=lambda x:len(x[1]), reverse = True)[:5]
+        count_references = sorted(references.items(), key=lambda x:(len(x[1]), x[0]), reverse = True)[:5]
         top_common_references = [(tup[0], float("{0:.2f}".format(len(tup[1])/num_papers))) for tup in count_references]
         top_common_references = dict(top_common_references)
         summary_graph.nodes[x]["top_common_references"] = top_common_references
@@ -169,9 +167,9 @@ def augment_graph_data(data, max_groups):
         for possible_real_index, node in enumerate(summary_json["nodes"]):
             if node == n:
                 real_index = possible_real_index
-        summary_json["nodes"][real_index]["node_name"] = i +1
+        summary_json["nodes"][real_index]["node_name"] = i + 1

-    # NOTE: We should remove this altogether and take the opportunity to change this in Nectar
+    # NOTE: From Python 2 to 3 transition
     # Older networkx versions were producing a links structure using positional ids
     # so we created the artificial key 'stable_index' to make things easier on the front-end side
@@ -208,7 +206,6 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
     '''
     # Get get paper list from the Solr data
     papers_list = [a['bibcode'] for a in solr_data]
-    number_of_papers = len(papers_list)
     # First construct the reference dictionary, and a unique list of cited papers
     reference_dictionary = _get_reference_mapping(solr_data)
     # From now on we'll only work with publications that actually have references
@@ -218,12 +215,12 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
     # transform that list into a dictionary for fast lookup
     ref_list = dict(zip(ref_list, list(range(len(ref_list)))))
     empty_vec = [0]*len(ref_list)
-    # Construct the paper-citation occurence matrix R
+    # Construct the paper-citation occurrence matrix R
     entries = []
-    for p in papers:
+    for paper in papers:
         vec = empty_vec[:]
-        ref_ind = [ref_list.get(a) for a in reference_dictionary[p]]
-        for entry in ref_ind:
+        ref_indexes = [ref_list.get(reference) for reference in reference_dictionary[paper]]
+        for entry in ref_indexes:
             vec[entry] = 1
         entries.append(vec)
     #done with ref_list
@@ -247,7 +244,7 @@ def get_papernetwork(solr_data, max_groups, weighted=True, equalization=False, d
         W = numpy.concatenate(weights)
         # Done with weights
         del weights
-        # Get the co-occurence matrix C
+        # Get the co-occurrence matrix C
         C = R.T*(R-W)
     else:
         C = R.T*R
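Note on the paper_network.py changes above: besides the spelling and renaming cleanups (occurence to occurrence, p to paper, ref_ind to ref_indexes), two edits change behavior: random_state=0 again pins the Louvain partition, and the secondary sort key x[0] breaks ties by bibcode so the top-5 common-reference selection is stable. For readers following the matrix code, here is a minimal standalone sketch of what the occurrence-matrix loop computes, with a hypothetical two-paper reference_dictionary standing in for the one _get_reference_mapping builds:

import numpy

# hypothetical input: bibcode -> set of cited references
reference_dictionary = {
    "paperA": {"ref1", "ref2"},
    "paperB": {"ref2", "ref3"},
}

# unique reference -> column index, like the ref_list lookup dict
all_refs = sorted({r for refs in reference_dictionary.values() for r in refs})
ref_list = {reference: i for i, reference in enumerate(all_refs)}

# paper-citation occurrence matrix R: one 0/1 row per paper,
# flagging which references that paper cites
entries = []
for paper in reference_dictionary:
    vec = [0] * len(ref_list)
    for entry in [ref_list.get(reference) for reference in reference_dictionary[paper]]:
        vec[entry] = 1
    entries.append(vec)

R = numpy.mat(entries)
# unweighted branch of get_papernetwork: C[i, j] counts the papers
# that cite both reference i and reference j
C = R.T * R

The weighted branch subtracts a per-paper weight matrix W before multiplying (C = R.T*(R-W)); the sketch covers only the unweighted case.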

Large diffs are not rendered by default.

47 changes: 5 additions & 42 deletions vis_services/tests/test_internals.py
@@ -1,11 +1,10 @@
-import sys, os, copy
+import sys, os
 from flask_testing import TestCase
 import httpretty
 import json
 from collections import defaultdict
 PROJECT_HOME = os.path.abspath(os.path.join(os.path.dirname(__file__),'../../'))
 sys.path.append(PROJECT_HOME)
-import requests
 from vis_services import app
 from vis_services.lib import word_cloud
 from vis_services.lib import author_network
@@ -78,13 +77,8 @@ def test_author_network_resource(self):
         # testing entire function

         processed_data = json.loads(json.dumps(author_network.augment_graph_data(input_js_author_network, input_js_data_parameter), sort_keys=True))
-        # self.assertEqual(processed_data, test_js_author_network)
-        self.assertEqual(processed_data['bibcode_dict'], test_js_author_network['bibcode_dict'])
-        self.assertEqual(processed_data['root'], test_js_author_network['root'])
-        # order of link data doesn't match, but should that matter?
-        self.assertEqual(len(processed_data['link_data']), len(test_js_author_network['link_data']))
-        for e in test_js_author_network['link_data']:
-            self.assertTrue(e in processed_data['link_data'])
+
+        self.assertEqual(processed_data, test_js_author_network)

     def test_paper_network_resource(self):

@@ -129,40 +123,9 @@ def get_group_references(group):
         # now just test input/output

         test_js_paper_network = json.load(open(STUBDATA_DIR + "/test_output/paper_network_star.json"))
-
         processed_data = json.loads(json.dumps(paper_network.get_papernetwork(input_js_paper_network["response"]["docs"], 10), sort_keys=True))
-        # note for the reviewer:
-        # keys in 'fullGraph' dict:
-        # 'directed', 'graph', 'links', 'multigraph', 'nodes'
-        links_values = processed_data['fullGraph']['links']
-        self.assertEqual(processed_data['fullGraph']['directed'], test_js_paper_network['fullGraph']['directed'])
-        self.assertEqual(processed_data['fullGraph']['graph'], test_js_paper_network['fullGraph']['graph'])
-        self.assertEqual(processed_data['fullGraph']['multigraph'], test_js_paper_network['fullGraph']['multigraph'])
-        # for 'nodes', the value for group doesn't match, for example:
-        # {'citation_count': 21, 'first_author': 'Katz, J.', 'group': 6, 'id': 7, 'nodeWeight': 21, 'node_name': '1978ApJ...223..299K', 'read_count': 8, 'title': 'Steepest descent technique and stellar equilibrium statistical mechanics. IV. Gravitating systems with an energy cutoff.'}
-        # {'citation_count': 21, 'first_author': 'Katz, J.', 'group': 3, 'id': 7, 'nodeWeight': 21, 'node_name': '1978ApJ...223..299K', 'read_count': 8, 'title': 'Steepest descent technique and stellar equilibrium statistical mechanics. IV. Gravitating systems with an energy cutoff.'}]
-        processed_data_tmp = copy.deepcopy(processed_data['fullGraph']['nodes'])
-        for x in processed_data_tmp:
-            x.pop('group')
-        test_js_paper_network_tmp = copy.deepcopy(test_js_paper_network['fullGraph']['nodes'])
-        for x in test_js_paper_network_tmp:
-            x.pop('group')
-        for x in processed_data_tmp:
-            self.assertTrue(x in test_js_paper_network_tmp)
-
-        # links comparison test fails when a value for overlap is not found
-        # for example, this is not found:
-        # {'overlap': ['1985A&A...150...33B', '1986A&AS...66..191B', '1988AJ.....96..635E'], 'source': 1, 'target': 44, 'weight': 4}
-
-        # self.assertEqual(processed_data['fullGraph']['links'], test_js_paper_network['fullGraph']['links'])
-        for x in test_js_paper_network['fullGraph']['links']:
-            if x['overlap'] == ['1988A&A...196...84C', '1985ApJ...299..211E']:
-                print(x)
-        mismatch_count = 0
-        for x in test_js_paper_network['fullGraph']['links']:
-            if x not in links_values:
-                mismatch_count += 1
-        print('fullGraph.links mismatch count: {}'.format(mismatch_count))
-
+        self.assertCountEqual(processed_data, test_js_paper_network)

 class TestAppLogic(TestCase):
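Note on the test changes above: the paper-network test now relies on unittest's assertCountEqual, which passes when both sequences contain the same elements regardless of order; because iterating a dict yields its keys, applying it to two dicts compares top-level keys only. A small standalone illustration of that behavior (demo code, not from the repo):

import unittest

class AssertCountEqualDemo(unittest.TestCase):

    def test_order_is_ignored(self):
        # same elements, different order: passes
        self.assertCountEqual([3, 1, 2], [1, 2, 3])

    def test_dicts_compare_keys_only(self):
        # iterating a dict yields its keys, so values are not compared
        self.assertCountEqual({"a": 1, "b": 2}, {"b": 9, "a": 0})

if __name__ == "__main__":
    unittest.main()

The author-network test, by contrast, can use a strict assertEqual because sorted(link_data) and random_state=0 make augment_graph_data's output fully deterministic.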
