Merge branch 'master' of github.com:callahantiff/PheKnowLator
callahantiff committed Apr 13, 2021
2 parents e1d2f37 + cc1658e commit 5d77b72
Showing 22 changed files with 326 additions and 289 deletions.
61 changes: 6 additions & 55 deletions builds/build_phase_3.py
@@ -32,39 +32,6 @@
logging.config.fileConfig(log_config[0], disable_existing_loggers=False, defaults={'log_file': log_dir + '/' + log})


-def derives_networkx_graph_statistics(graph) -> str:
-    """Derives statistics from an input knowledge graph and prints them to the console. Note that we are not
-    converting each node to a string before deriving our counts. This is purposeful as the number of unique nodes
-    is altered when it is converted to a string. For example, in the HPO, when honoring the RDF type of each node
-    there are 406,717 unique nodes versus 406,331 unique nodes when ignoring the RDF type of each node.
-
-    Args:
-        graph: A networkx.MultiDiGraph object.
-
-    Returns:
-        stats: A formatted string containing descriptive statistics.
-    """
-
-    # derive statistics
-    nx_graph_und = graph.to_undirected()
-    nodes = networkx.number_of_nodes(graph); edges = networkx.number_of_edges(graph)
-    self_loops = networkx.number_of_selfloops(graph)
-    ce = sorted(Counter([str(x[2]) for x in graph.edges(keys=True)]).items(),  # type: ignore
-                key=lambda x: x[1], reverse=1)[:5]  # type: ignore
-    avg_degree = float(edges) / nodes
-    n_deg = sorted([(str(x[0]), x[1]) for x in graph.degree()], key=lambda x: x[1], reverse=1)[:5]  # type: ignore
-    density = networkx.density(graph)
-    components = sorted(list(networkx.connected_components(nx_graph_und)), key=len, reverse=True)
-    cc_sizes = {x: len(components[x]) for x in range(len(components))}
-    x = '{} nodes, {} edges, {} self-loops, 5 most common edges: {}, average degree {}, 5 highest degree ' \
-        'nodes: {}, density: {}, {} component(s) and size(s): {}'
-    stats = 'Graph Stats: ' + x.format(nodes, edges, self_loops, ', '.join([x[0] + ':' + str(x[1]) for x in ce]),
-                                       avg_degree, ', '.join([x[0] + ':' + str(x[1]) for x in n_deg]),
-                                       density, len(components), cc_sizes)
-
-    return stats
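
The docstring's caveat about string conversion is easy to reproduce. A minimal standalone sketch (not part of this commit; the example URI is made up) showing how stringifying nodes collapses two RDF-distinct entities into one:

    import networkx
    from rdflib import Literal, URIRef

    g = networkx.MultiDiGraph()
    u = URIRef('http://example.org/A')         # an IRI node
    v = Literal('http://example.org/A')        # a literal with the same lexical form
    g.add_edge(u, v)

    print(networkx.number_of_nodes(g))         # 2 -- URIRef and Literal are distinct nodes
    print(len({str(n) for n in g.nodes()}))    # 1 -- str() erases the RDF type distinction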


def uploads_build_data(bucket, gcs_location) -> None:
    """Moves data from docker container to the dedicated Google Cloud Storage Bucket directory.
@@ -188,25 +155,8 @@ def main(app, rel, owl):
    uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log)  # uploads log to gcs bucket

-    #############################################################################
-    # STEP 4 - PRINT BUILD STATISTICS
-    logs = 'STEP 4: DERIVING NETWORK STATISTICS FOR BUILD KNOWLEDGE GRAPHS'; print('\n' + logs); logger.info(logs)
-
-    try:  # find Networkx MultiDiGraph files in Google Cloud Storage Bucket for build
-        kg_owl = [f.name for f in bucket.list_blobs(prefix=gcs_current_loc_owl) if f.name.endswith('gpickle')]
-        kg_owlnets = [f.name for f in bucket.list_blobs(prefix=gcs_current_loc_owlnets) if f.name.endswith('gpickle')]
-        for f in set(kg_owl + kg_owlnets):
-            log_str = 'Loading graph data: {}'.format(f.split('/')[-1]); print(log_str); logger.info(log_str)
-            bucket_loc = gcs_current_loc_owlnets if 'OWLNETS' in f else gcs_current_loc_owl
-            nx_local_file = downloads_data_from_gcs_bucket(bucket, None, bucket_loc, f.split('/')[-1], '')
-            graph = networkx.read_gpickle(nx_local_file)
-            stats = derives_networkx_graph_statistics(graph); print(stats); logger.info(stats)
-    except: logger.error('ERROR: Uncaught Exception: {}'.format(traceback.format_exc()))
-
-    uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log)  # uploads log to gcs bucket
-
    #############################################################################
-    # STEP 5 - CLEAN UP BUILD ENVIRONMENT + LOG EXIT STATUS TO FINISH RUN
-    print('\nSTEP 5: BUILD CLEAN-UP'); logger.info('STEP 5: BUILD CLEAN-UP')
+    # STEP 4 - CLEAN UP BUILD ENVIRONMENT + LOG EXIT STATUS TO FINISH RUN
+    print('\nSTEP 4: BUILD CLEAN-UP'); logger.info('STEP 4: BUILD CLEAN-UP')
    runtime = round((datetime.now() - start_time).total_seconds() / 60, 3)
    print('\n\n' + '*' * 5 + ' COMPLETED BUILD PHASE 3: {} MINUTES '.format(runtime) + '*' * 5)
    logger.info('COMPLETED BUILD PHASE 3: {} MINUTES'.format(runtime)); logger.info('EXIT BUILD PHASE 3')
@@ -218,12 +168,13 @@ def main(app, rel, owl):
    # owl build
    copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_archive_loc_owl, [log_1[0].split('/')[-1]])
    copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_current_loc_owl, [log_1[0].split('/')[-1]])
-    copies_data_between_gcs_bucket_directories(bucket, gcs_archive_loc_owl, gcs_current_loc_owl, [log])
+    copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_archive_loc_owl, [log])
+    copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_current_loc_owl, [log])
    # owl-nets build
    copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_archive_loc_owlnets, [log_1[0].split('/')[-1]])
    copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_current_loc_owlnets, [log_1[0].split('/')[-1]])
-    copies_data_between_gcs_bucket_directories(bucket, gcs_archive_loc_owl, gcs_archive_loc_owlnets, [log])
-    copies_data_between_gcs_bucket_directories(bucket, gcs_current_loc_owl, gcs_current_loc_owlnets, [log])
+    copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_archive_loc_owlnets, [log])
+    copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_current_loc_owlnets, [log])

    # exit build
    uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log)  # uploads log to gcs bucket
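
For context on the deleted STEP 4 above: it listed gpickle blobs in the bucket, downloaded each, and re-read it with networkx before computing statistics. A minimal sketch of that pattern, reusing the derives_networkx_graph_statistics function shown above (not part of this commit; the bucket name and prefix are hypothetical, and read_gpickle assumes networkx < 3.0):

    import networkx
    from google.cloud import storage

    bucket = storage.Client().get_bucket('pheknowlator')                        # hypothetical bucket name
    for blob in bucket.list_blobs(prefix='current_build/knowledge_graphs/'):    # hypothetical prefix
        if blob.name.endswith('gpickle'):
            local_file = blob.name.split('/')[-1]
            blob.download_to_filename(local_file)       # pull the graph out of the bucket
            graph = networkx.read_gpickle(local_file)   # removed in networkx 3.0
            print(derives_networkx_graph_statistics(graph))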
4 changes: 2 additions & 2 deletions notebooks/Data_Preparation.ipynb
@@ -103,9 +103,9 @@
"metadata": {},
"outputs": [],
"source": [
"# # uncomment and update to install any required modules\n",
"# # uncomment and run to install any required modules from notebooks/requirements.txt\n",
"# import sys\n",
"# !{sys.executable} -m pip install pkt_kg rdflib reactome2py"
"# !{sys.executable} -m pip install -r requirements.txt"
]
},
{
4 changes: 2 additions & 2 deletions notebooks/OWLNETS_Example_Application.ipynb
@@ -94,9 +94,9 @@
"metadata": {},
"outputs": [],
"source": [
"# # uncomment and update to install any required modules\n",
"# # uncomment and run to install any required modules from notebooks/requirements.txt\n",
"# import sys\n",
"# !{sys.executable} -m pip install pkt_kg rdflib"
"# !{sys.executable} -m pip install -r requirements.txt"
]
},
{
4 changes: 2 additions & 2 deletions notebooks/Ontology_Cleaning.ipynb
@@ -178,9 +178,9 @@
"metadata": {},
"outputs": [],
"source": [
"# # uncomment and update to install any required modules\n",
"# # uncomment and run to install any required modules from notebooks/requirements.txt\n",
"# import sys\n",
"# !{sys.executable} -m pip install glob pkt_kg pickle rdflib tqdm"
"# !{sys.executable} -m pip install -r requirements.txt"
]
},
{
6 changes: 3 additions & 3 deletions notebooks/RDF_Graph_Processing_Example.ipynb
@@ -61,9 +61,9 @@
"metadata": {},
"outputs": [],
"source": [
"# # uncomment and update to install any required modules\n",
"# # uncomment and run to install any required modules from notebooks/requirements.txt\n",
"# import sys\n",
"# !{sys.executable} -m pip install networkx pkt_kg rdflib"
"# !{sys.executable} -m pip install -r requirements.txt"
]
},
{
@@ -860,7 +860,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.8"
"version": "3.6.2"
}
},
"nbformat": 4,
15 changes: 15 additions & 0 deletions notebooks/requirements.txt
@@ -0,0 +1,15 @@
+Cython>=0.29.14
+more-itertools
+networkx
+numpy>=1.18.1
+openpyxl>=3.0.3
+pandas>=1.0.5
+psutil
+python-json-logger
+ray
+rdflib
+reactome2py
+requests
+responses==0.10.12
+tqdm
+urllib3
6 changes: 3 additions & 3 deletions pkt_kg/construction_approaches.py
@@ -92,10 +92,10 @@ def maps_node_to_class(self, edge_type: str, entity: str) -> Optional[List]:
        non-class entity node is returned.
        """

-        e_type = edge_type
        if entity not in self.subclass_dict.keys():
-            if self.subclass_error and e_type in self.subclass_error.keys(): self.subclass_error[e_type] += [entity]
-            else: self.subclass_error[e_type] = [entity]
+            if self.subclass_error and edge_type in self.subclass_error.keys():
+                if entity not in self.subclass_error[edge_type]: self.subclass_error[edge_type] += [entity]
+            else: self.subclass_error[edge_type] = [entity]
            subclass_map = None
        else: subclass_map = self.subclass_dict[entity]

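The change above stops maps_node_to_class from logging the same entity repeatedly under one edge type. A minimal standalone sketch of the corrected bookkeeping (not part of this commit; the dictionaries and identifiers are made up):

    subclass_dict = {'2022': ['http://purl.obolibrary.org/obo/SO_0001217']}  # known entity-to-class map
    subclass_error: dict = {}                                                # edge type -> unmapped entities

    def log_unmapped(edge_type: str, entity: str) -> None:
        if entity not in subclass_dict.keys():
            if subclass_error and edge_type in subclass_error.keys():
                if entity not in subclass_error[edge_type]: subclass_error[edge_type] += [entity]
            else: subclass_error[edge_type] = [entity]

    log_unmapped('gene-phenotype', '11998')
    log_unmapped('gene-phenotype', '11998')  # duplicate, now ignored
    print(subclass_error)                    # {'gene-phenotype': ['11998']}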