diff --git a/builds/build_phase_3.py b/builds/build_phase_3.py index f03fc5a9..6f52be60 100755 --- a/builds/build_phase_3.py +++ b/builds/build_phase_3.py @@ -32,39 +32,6 @@ logging.config.fileConfig(log_config[0], disable_existing_loggers=False, defaults={'log_file': log_dir + '/' + log}) -def derives_networkx_graph_statistics(graph) -> str: - """Derives statistics from an input knowledge graph and prints them to the console. Note that we are not - converting each node to a string before deriving our counts. This is purposeful as the number of unique nodes is - altered when you it converted to a string. For example, in the HPO when honoring the RDF type of each node there are - 406,717 unique nodes versus 406,331 unique nodes when ignoring the RDF type of each node. - - Args: - graph: An networkx.MultiDiGraph object. - - Returns: - stats: A formatted string containing descriptive statistics. - """ - - # derive statistics - nx_graph_und = graph.to_undirected() - nodes = networkx.number_of_nodes(graph); edges = networkx.number_of_edges(graph) - self_loops = networkx.number_of_selfloops(graph) - ce = sorted(Counter([str(x[2]) for x in graph.edges(keys=True)]).items(), # type: ignore - key=lambda x: x[1], reverse=1)[:6] # type: ignore - avg_degree = float(edges) / nodes - n_deg = sorted([(str(x[0]), x[1]) for x in graph.degree()], key=lambda x: x[1], reverse=1)[:6] # type: ignore - density = networkx.density(graph) - components = sorted(list(networkx.connected_components(nx_graph_und)), key=len, reverse=True) - cc_sizes = {x: len(components[x]) for x in range(len(components))} - x = '{} nodes, {} edges, {} self-loops, 5 most most common edges: {}, average degree {}, 5 highest degree '\ - 'nodes: {}, density: {}, {} component(s) and size(s): {}' - stats = 'Graph Stats: ' + x.format(nodes, edges, self_loops, ', '.join([x[0] + ':' + str(x[1]) for x in ce]), - avg_degree, ', '.join([x[0] + ':' + str(x[1]) for x in n_deg]), - density, len(components), cc_sizes) - - return stats - - def uploads_build_data(bucket, gcs_location) -> None: """Moves data from docker container to the dedicated Google Cloud Storage Bucket directory. 
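The statistics step removed in this hunk is superseded later in the patch: convert_to_networkx() in pkt_kg/utils/kg_utils.py gains a stats flag and returns the derives_graph_statistics() string after pickling the MultiDiGraph. For reference, a minimal sketch of deriving comparable counts from an already-pickled graph, mirroring the removed helper (illustrative only: the file name is hypothetical and read_gpickle assumes networkx 2.x, where that function still exists):

import networkx

# load a previously pickled MultiDiGraph (hypothetical file name)
graph = networkx.read_gpickle('PheKnowLator_OWL_NetworkxMultiDiGraph.gpickle')
nodes = networkx.number_of_nodes(graph); edges = networkx.number_of_edges(graph)
self_loops = networkx.number_of_selfloops(graph)
avg_degree = float(edges) / nodes; density = networkx.density(graph)
# components are computed on the undirected projection, as in the removed helper
components = sorted(networkx.connected_components(graph.to_undirected()), key=len, reverse=True)
print('{} nodes, {} edges, {} self-loops, average degree {}, density: {}, {} component(s)'.format(
    nodes, edges, self_loops, avg_degree, density, len(components)))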
@@ -188,25 +155,8 @@ def main(app, rel, owl): uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log) # uploads log to gcs bucket ############################################################################# - # STEP 4 - PRINT BUILD STATISTICS - logs = 'STEP 4: DERIVING NETWORK STATISTICS FOR BUILD KNOWLEDGE GRAPHS'; print('\n' + logs); logger.info(logs) - - try: # find Networkx MultiDiGraph files in Google Cloud Storage Bucket for build - kg_owl = [f.name for f in bucket.list_blobs(prefix=gcs_current_loc_owl) if f.name.endswith('gpickle')] - kg_owlnets = [f.name for f in bucket.list_blobs(prefix=gcs_current_loc_owlnets) if f.name.endswith('gpickle')] - for f in set(kg_owl + kg_owlnets): - log_str = 'Loading graph data: {}'.format(f.split('/')[-1]); print(log_str); logger.info(log_str) - bucket_loc = gcs_current_loc_owlnets if 'OWLNETS' in f else gcs_current_loc_owl - nx_local_file = downloads_data_from_gcs_bucket(bucket, None, bucket_loc, f.split('/')[-1], '') - graph = networkx.read_gpickle(nx_local_file) - stats = derives_networkx_graph_statistics(graph); print(stats); logger.info(stats) - except: logger.error('ERROR: Uncaught Exception: {}'.format(traceback.format_exc())) - - uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log) # uploads log to gcs bucket - - ############################################################################# - # STEP 5 - CLEAN UP BUILD ENVIRONMENT + LOG EXIT STATUS TO FINISH RUN - print('\nSTEP 5: BUILD CLEAN-UP'); logger.info('STEP 5: BUILD CLEAN-UP') + # STEP 4 - CLEAN UP BUILD ENVIRONMENT + LOG EXIT STATUS TO FINISH RUN + print('\nSTEP 4: BUILD CLEAN-UP'); logger.info('STEP 4: BUILD CLEAN-UP') runtime = round((datetime.now() - start_time).total_seconds() / 60, 3) print('\n\n' + '*' * 5 + ' COMPLETED BUILD PHASE 3: {} MINUTES '.format(runtime) + '*' * 5) logger.info('COMPLETED BUILD PHASE 3: {} MINUTES'.format(runtime)); logger.info('EXIT BUILD PHASE 3') @@ -218,12 +168,13 @@ def main(app, rel, owl): # owl build copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_archive_loc_owl, [log_1[0].split('/')[-1]]) copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_current_loc_owl, [log_1[0].split('/')[-1]]) - copies_data_between_gcs_bucket_directories(bucket, gcs_archive_loc_owl, gcs_current_loc_owl, [log]) + copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_archive_loc_owl, [log]) + copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_current_loc_owl, [log]) # owl-nets build copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_archive_loc_owlnets, [log_1[0].split('/')[-1]]) copies_data_between_gcs_bucket_directories(bucket, gcs_log_root, gcs_current_loc_owlnets, [log_1[0].split('/')[-1]]) - copies_data_between_gcs_bucket_directories(bucket, gcs_archive_loc_owl, gcs_archive_loc_owlnets, [log]) - copies_data_between_gcs_bucket_directories(bucket, gcs_current_loc_owl, gcs_current_loc_owlnets, [log]) + copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_archive_loc_owlnets, [log]) + copies_data_between_gcs_bucket_directories(bucket, gcs_log_location, gcs_current_loc_owlnets, [log]) # exit build uploads_data_to_gcs_bucket(bucket, gcs_log_location, log_dir, log) # uploads log to gcs bucket diff --git a/notebooks/Data_Preparation.ipynb b/notebooks/Data_Preparation.ipynb index 20753021..ebf9c88c 100644 --- a/notebooks/Data_Preparation.ipynb +++ b/notebooks/Data_Preparation.ipynb @@ -103,9 +103,9 @@ "metadata": {}, "outputs": [], 
"source": [ - "# # uncomment and update to install any required modules\n", + "# # uncomment and run to install any required modules from notebooks/requirements.txt\n", "# import sys\n", - "# !{sys.executable} -m pip install pkt_kg rdflib reactome2py" + "# !{sys.executable} -m pip install -r requirements.txt" ] }, { diff --git a/notebooks/OWLNETS_Example_Application.ipynb b/notebooks/OWLNETS_Example_Application.ipynb index 870ba653..a07a2db9 100644 --- a/notebooks/OWLNETS_Example_Application.ipynb +++ b/notebooks/OWLNETS_Example_Application.ipynb @@ -94,9 +94,9 @@ "metadata": {}, "outputs": [], "source": [ - "# # uncomment and update to install any required modules\n", + "# # uncomment and run to install any required modules from notebooks/requirements.txt\n", "# import sys\n", - "# !{sys.executable} -m pip install pkt_kg rdflib" + "# !{sys.executable} -m pip install -r requirements.txt" ] }, { diff --git a/notebooks/Ontology_Cleaning.ipynb b/notebooks/Ontology_Cleaning.ipynb index 19c6aa62..77aede48 100644 --- a/notebooks/Ontology_Cleaning.ipynb +++ b/notebooks/Ontology_Cleaning.ipynb @@ -178,9 +178,9 @@ "metadata": {}, "outputs": [], "source": [ - "# # uncomment and update to install any required modules\n", + "# # uncomment and run to install any required modules from notebooks/requirements.txt\n", "# import sys\n", - "# !{sys.executable} -m pip install glob pkt_kg pickle rdflib tqdm" + "# !{sys.executable} -m pip install -r requirements.txt" ] }, { diff --git a/notebooks/RDF_Graph_Processing_Example.ipynb b/notebooks/RDF_Graph_Processing_Example.ipynb index 8fc9e7d8..aa699373 100644 --- a/notebooks/RDF_Graph_Processing_Example.ipynb +++ b/notebooks/RDF_Graph_Processing_Example.ipynb @@ -61,9 +61,9 @@ "metadata": {}, "outputs": [], "source": [ - "# # uncomment and update to install any required modules\n", + "# # uncomment and run to install any required modules from notebooks/requirements.txt\n", "# import sys\n", - "# !{sys.executable} -m pip install networkx pkt_kg rdflib" + "# !{sys.executable} -m pip install -r requirements.txt" ] }, { @@ -860,7 +860,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.6.2" } }, "nbformat": 4, diff --git a/notebooks/requirements.txt b/notebooks/requirements.txt new file mode 100644 index 00000000..d81cb474 --- /dev/null +++ b/notebooks/requirements.txt @@ -0,0 +1,15 @@ +Cython>=0.29.14 +more-itertools +networkx +numpy>=1.18.1 +openpyxl>=3.0.3 +pandas>=1.0.5 +psutil +python-json-logger +ray +rdflib +reactome2py +requests +responses==0.10.12 +tqdm +urllib3 \ No newline at end of file diff --git a/pkt_kg/construction_approaches.py b/pkt_kg/construction_approaches.py index 53ac0644..ae0d828b 100644 --- a/pkt_kg/construction_approaches.py +++ b/pkt_kg/construction_approaches.py @@ -92,10 +92,10 @@ def maps_node_to_class(self, edge_type: str, entity: str) -> Optional[List]: non-class entity node is returned. 
""" - e_type = edge_type if entity not in self.subclass_dict.keys(): - if self.subclass_error and e_type in self.subclass_error.keys(): self.subclass_error[e_type] += [entity] - else: self.subclass_error[e_type] = [entity] + if self.subclass_error and edge_type in self.subclass_error.keys(): + if entity not in self.subclass_error[edge_type]: self.subclass_error[edge_type] += [entity] + else: self.subclass_error[edge_type] = [entity] subclass_map = None else: subclass_map = self.subclass_dict[entity] diff --git a/pkt_kg/knowledge_graph.py b/pkt_kg/knowledge_graph.py index 538c94fe..cc5feb9c 100644 --- a/pkt_kg/knowledge_graph.py +++ b/pkt_kg/knowledge_graph.py @@ -191,13 +191,16 @@ class EdgeConstructor(object): metadata: An instance of the metadata class with bound method needed for created edge metadata. ont_cls: A set of RDFLib URIRef terms representing all classes in the core merged ontologies. obj_props: A set of RDFLib URIRef terms representing all object properties in the core merged ontologies. - write_loc: An string passed specifying the primary directory to write to. + write_loc: A string passed specifying the primary directory to write to. """ def __init__(self, params) -> None: + self.clean_graph: Graph = Graph() self.construction: str = params.get('construction') self.edge_dict: dict = params.get('edge_dict') + self.error_dict: Dict = dict() + self.graph: Graph = Graph() self.kg_owl = params.get('kg_owl') self.inverse_relations_dict: Optional[Dict] = params.get('inverse_dict') self.node_data: Optional[str] = 'yes' if params.get('node_data') is not None else None @@ -205,15 +208,14 @@ def __init__(self, params) -> None: self.obj_properties: Set = params.get('obj_props') self.ont_classes: Set = params.get('ont_cls') self.relations_dict: Optional[Dict] = params.get('rel_dict') - self.write_location: str = params.get('write_loc') - self.error_dict: Dict = dict() - self.graph: Graph = Graph() self.res_dir: str = os.path.abspath('/'.join(params.get('write_loc').split('/')[:-1])) + self.write_location: str = params.get('write_loc') - def graph_getter(self) -> Graph: - """Methods returns inner class RDFLib Graph object.""" + def graph_getter(self) -> Tuple[Graph, Graph]: + """Methods returns two inner class RDFLib Graph objects the first contains pkt-namespaces and the second + contains the bnodes (anonymous nodes) with the pkt_namespace removed.""" - return self.graph + return self.graph, self.clean_graph def error_dict_getter(self) -> Dict: """Methods returns inner class subclass error dict object.""" @@ -295,8 +297,7 @@ def gets_edge_statistics(edge_type: str, results: Set, entity_info: List) -> str Args: edge_type: A string point to a specific edge type (e.g. 'chemical-disease). - results: A set of tuples representing the complete set of triples generated from the construction - process. + results: A set of tuples representing the complete set of triples from the construction process. entity_info: 3 items: 1-2 are sets of node tuples and 3 is the total count of non-OWL edges. 
Returns: @@ -329,8 +330,7 @@ def creates_new_edges(self, edge_type: str) -> Graph: invrel = self.checks_relations(rel, edge_list) if self.inverse_relations_dict is not None else None n1, n2, rels = set(), set(), 0; res: Set = set() # ; pbar = tqdm(total=len(edge_list)) while len(edge_list) > 0: - # pbar.update(1) - edge = edge_list.pop(0) + edge = edge_list.pop(0) # ; pbar.update(1) edge_info = {'n1': s, 'n2': o, 'rel': rel, 'inv_rel': invrel, 'uri': uri, 'edges': edge} meta = self.node_metadata_func(ent=[''.join(x) for x in list(zip(uri, edge))], e_type=[s, o]) meta_logic = [True if (self.node_data is None and meta is None) or [s, o] == ['class', 'class'] @@ -338,16 +338,14 @@ def creates_new_edges(self, edge_type: str) -> Graph: if self.checks_classes(edge_info) and meta_logic: if self.construction == 'subclass': edges = set(kg_bld.subclass_constructor(edge_info, edge_type)) else: edges = set(kg_bld.instance_constructor(edge_info, edge_type)) - cleaned_edges = updates_pkt_namespace_identifiers(edges, self.construction, False) - self.graph = adds_edges_to_graph(self.graph, cleaned_edges, False) res |= edges; n1 |= {edge[0]}; n2 |= {edge[1]}; rels = rels + 1 if invrel is None else rels + 2 - appends_to_existing_file(edges, logic) + self.graph = adds_edges_to_graph(self.graph, edges, False); appends_to_existing_file(edges, logic) if meta is not None: appends_to_existing_file(meta, anot) - # pbar.close() - stat = self.gets_edge_statistics(edge_type, res, [n1, n2, rels]); del [n1, n2, rels], res + cleaned_graph = updates_pkt_namespace_identifiers(edges, self.construction, False) + self.clean_graph = adds_edges_to_graph(self.clean_graph, cleaned_graph, False) + stat = self.gets_edge_statistics(edge_type, res, [n1, n2, rels]); del [n1, n2, rels], res # ; pbar.close() p = 'Created {} ({}-{}) Edges: {}'.format(edge_type.upper(), s, o, stat); print('\n' + p); logger.info(p) if len(kg_bld.subclass_error.keys()) > 0: self.error_dict = kg_bld.subclass_error - # self.graph = updates_pkt_namespace_identifiers(self.graph, self.construction) # remove bnode namespacing return None @@ -397,7 +395,7 @@ def construct_knowledge_graph(self) -> None: # STEP 4: CREATE GRAPH SUBSETS log_str = '*** Splitting Graph ***'; print(log_str); logger.info(log_str) f = self.write_location; self.graph, annotation_triples = splits_knowledge_graph(self.graph) - stats = 'Merged Logic Subset {}'.format(derives_graph_statistics(self.graph)); print(stats); logger.info(stats) + s = 'Merged Ontologies - Logic Subset {}'.format(derives_graph_statistics(self.graph)); print(s); logger.info(s) kg_owl = '_'.join(self.full_kg.split('_')[0:-1]) + '_OWL.owl' annot, logic, full = kg_owl[:-4] + '_AnnotationsOnly.nt', kg_owl[:-4] + '_LogicOnly.nt', kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, f + annot); appends_to_existing_file(self.graph, f + logic) @@ -412,12 +410,14 @@ def construct_knowledge_graph(self) -> None: args = {'construction': self.construct_approach, 'edge_dict': self.edge_dict, 'write_loc': self.write_location, 'rel_dict': self.relations_dict, 'inverse_dict': self.inverse_relations_dict, 'kg_owl': kg_owl, 'node_data': self.node_data, 'ont_cls': self.ont_classes, 'metadata': meta.creates_node_metadata, - 'obj_props': self.obj_properties}; edges = [x for x in self.edge_dict.keys()] + 'obj_props': self.obj_properties} + edges = sublist_creator({k: len(v['edge_list']) for k, v in self.edge_dict.items()}, self.cpus) actors = [ray.remote(self.EdgeConstructor).remote(args) for _ in range(self.cpus)] # type: ignore 
- for i in range(0, len(edges)): actors[i % self.cpus].creates_new_edges.remote(edges[i]) # type: ignore + for i in range(0, len(edges)): [actors[i].creates_new_edges.remote(j) for j in edges[i]] # type: ignore # extract results, aggregate actor dictionaries into single dictionary, and write data to json file _ = ray.wait([x.graph_getter.remote() for x in actors], num_returns=len(actors)) - graphs = [self.graph] + ray.get([x.graph_getter.remote() for x in actors]) + graph_res = ray.get([x.graph_getter.remote() for x in actors]) + graphs = [self.graph] + [x[0] for x in graph_res] # ; clean_graphs = [x[1] for x in graph_res] error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors if len(error_dicts.keys()) > 0: # output error logs log_file = glob.glob(self.res_dir + '/construction*')[0] + '/subclass_map_log.json' @@ -487,15 +487,16 @@ def construct_knowledge_graph(self) -> None: kg_owl = '_'.join(self.full_kg.split('_')[0:-1]) + '_OWL.owl'; kg_owl_main = kg_owl[:-8] + '.owl' annot, logic, full = kg_owl[:-4] + '_AnnotationsOnly.nt', kg_owl[:-4] + '_LogicOnly.nt', kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, _ + annot); appends_to_existing_file(self.graph, _ + logic) - self.graph = updates_pkt_namespace_identifiers(self.graph, self.construct_approach); del annotation_triples + del annotation_triples # STEP 5: DECODE OWL SEMANTICS results = [set(self.graph), None, None] stats = 'Full Logic {}'.format(derives_graph_statistics(results[0])); print(stats); logger.info(stats) logger.info('*** Converting Knowledge Graph to Networkx MultiDiGraph ***') - convert_to_networkx(self.write_location, kg_owl[:-4], results[0]) - stats = derives_graph_statistics(results[0]); print(stats); logger.info(stats) + s = convert_to_networkx(self.write_location, kg_owl[:-4], results[0], True) + if s is not None: log_stats = 'Full Logic Subset (OWL) {}'.format(s); logger.info(log_stats); print(log_stats) if self.decode_owl: + self.graph = updates_pkt_namespace_identifiers(self.graph, self.construct_approach) owlnets = OwlNets(self.graph, self.write_location, kg_owl_main, self.construct_approach, self.owl_tools) results = [results[0]] + list(owlnets.runs_owlnets(self.cpus)) @@ -563,11 +564,11 @@ def construct_knowledge_graph(self) -> None: # STEP 4: CREATE GRAPH SUBSETS log_str = '*** Splitting Graph ***'; print(log_str); logger.info(log_str) f = self.write_location; self.graph, annotation_triples = splits_knowledge_graph(self.graph) - stats = 'Merged Logic Subset {}'.format(derives_graph_statistics(self.graph)); print(stats); logger.info(stats) + s = 'Merged Ontologies - Logic Subset {}'.format(derives_graph_statistics(self.graph)); print(s); logger.info(s) kg_owl = '_'.join(self.full_kg.split('_')[0:-1]) + '_OWL.owl'; kg_owl_main = kg_owl[:-8] + '.owl' annot, logic, full = kg_owl[:-4] + '_AnnotationsOnly.nt', kg_owl[:-4] + '_LogicOnly.nt', kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, f + annot); appends_to_existing_file(self.graph, f + logic) - self.graph = updates_pkt_namespace_identifiers(self.graph, self.construct_approach); del annotation_triples + del annotation_triples # STEP 5: ADD EDGE DATA TO KNOWLEDGE GRAPH DATA log_str = '*** Building Knowledge Graph Edges ***'; print('\n' + log_str); logger.info(log_str) @@ -577,23 +578,25 @@ def construct_knowledge_graph(self) -> None: args = {'construction': self.construct_approach, 'edge_dict': self.edge_dict, 'node_data': self.node_data, 'rel_dict': self.relations_dict, 'inverse_dict': 
self.inverse_relations_dict, 'kg_owl': kg_owl, 'ont_cls': self.ont_classes, 'obj_props': self.obj_properties, 'metadata': meta.creates_node_metadata, - 'write_loc': self.write_location}; edges = [x for x in self.edge_dict.keys()] + 'write_loc': self.write_location} + edges = sublist_creator({k: len(v['edge_list']) for k, v in self.edge_dict.items()}, self.cpus) actors = [ray.remote(self.EdgeConstructor).remote(args) for _ in range(self.cpus)] # type: ignore - for i in range(0, len(edges)): actors[i % self.cpus].creates_new_edges.remote(edges[i]) # type: ignore - # extract results, aggregate actor dictionaries into single dictionary, and write data to json file + for i in range(0, len(edges)): [actors[i].creates_new_edges.remote(j) for j in edges[i]] # type: ignore _ = ray.wait([x.graph_getter.remote() for x in actors], num_returns=len(actors)) - graphs = [self.graph] + ray.get([x.graph_getter.remote() for x in actors]); del self.edge_dict, self.graph + res = ray.get([x.graph_getter.remote() for x in actors]); g1 = [x[0] for x in res]; g2 = [x[1] for x in res] error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors if len(error_dicts.keys()) > 0: # output error logs log_file = glob.glob(self.res_dir + '/construction*')[0] + '/subclass_map_log.json' logger.info('See log: {}'.format(log_file)); outputs_dictionary_data(error_dicts, log_file) # STEP 6: DECODE OWL SEMANTICS - results = [set(x for y in [set(x) for x in graphs] for x in y), None, None] + results = [set(x for y in [set(x) for x in [self.graph] + g1] for x in y), None, None] stats = 'Full Logic {}'.format(derives_graph_statistics(results[0])); print(stats); logger.info(stats) - logger.info('*** Converting Knowledge Graph to Networkx MultiDiGraph ***') - convert_to_networkx(self.write_location, kg_owl[:-4], results[0]) - if self.decode_owl: + s1 = convert_to_networkx(self.write_location, kg_owl[:-4], results[0], True) + if s1 is not None: log_stats = 'Full Logic Subset (OWL) {}'.format(s1); logger.info(log_stats); print(log_stats) + # aggregates processed owl-nets output derived when constructing non-ontology edges + if self.decode_owl is not None: + graphs = [updates_pkt_namespace_identifiers(self.graph, self.construct_approach)] + g2 owlnets = OwlNets(graphs, self.write_location, kg_owl_main, self.construct_approach, self.owl_tools) results = [results[0]] + list(owlnets.runs_owlnets(self.cpus)) @@ -603,7 +606,7 @@ def construct_knowledge_graph(self) -> None: for x in range(0, len(results)): graph = results[x]; p_str = 'OWL' if x == 0 else 'OWL-NETS' if x == 1 else 'Purified OWL-NETS' if graph is not None: - log_str = '*** Processing {} Graph ***'.format(p_str); print(log_str); logger.info(log_str) + log_str = '*** Processing {} Graph ***'.format(p_str); print('\n' + log_str); logger.info(log_str) triple_list_file = kg_owl[:-8] + f_prefix[x] + '_Triples_Integers.txt' triple_map = triple_list_file[:-5] + '_Identifier_Map.json' node_int_map = maps_ids_to_integers(graph, self.write_location, triple_list_file, triple_map) @@ -614,7 +617,8 @@ def construct_knowledge_graph(self) -> None: # deduplicate logic and annotation files, merge them, and print final stats deduplicates_file(f + annot); deduplicates_file(f + logic); merges_files(f + annot, f + logic, f + full) - graph = Graph().parse(f + full, format='nt') + str1 = '\nLoading Full (Logic + Annotation) Graph'; print('\n' + str1); logger.info(str1) + graph = Graph().parse(f + full, format='nt'); str2 = 'Deriving Stats'; print('\n' + str2); 
logger.info(str2) s = 'Full (Logic + Annotation) {}'.format(derives_graph_statistics(graph)); print('\n' + s); logger.info(s) return None diff --git a/pkt_kg/metadata.py b/pkt_kg/metadata.py index 87d5a03a..aaa042c9 100644 --- a/pkt_kg/metadata.py +++ b/pkt_kg/metadata.py @@ -152,11 +152,17 @@ def creates_node_metadata(self, ent: List, e_type: Optional[List] = None, key_ty """ key, edges, x = key_type, [], [] - if self.node_dict: - if key == 'relations' and e_type is None: x = [i for i in ent if i in self.node_dict[key].keys()] - elif e_type: x = [i for i in ent if e_type[ent.index(i)] != 'class' and i in self.node_dict[key].keys()] - else: pass - if len(x) > 0: # add metadata for eligible entities + if self.node_dict and isinstance(self.node_dict, Dict): + if key == 'nodes' and isinstance(e_type, List): + x = [i for i in ent if e_type[ent.index(i)] != 'class' and i in self.node_dict[key].keys()] + elif key == 'relations': x = [i for i in ent if i in self.node_dict[key].keys()] + else: return None + # check for matches + if (key == 'relations' and e_type is None) and len(x) == 0: return None + elif e_type == ['class', 'class'] and len(x) == 0: return None + elif (e_type == ['class', 'entity'] or e_type == ['entity', 'class']) and len(x) == 0: return None + elif e_type == ['entity', 'entity'] and len(x) != 2: return None + else: for i in x: metadata_info = self.node_dict[key][i] if 'Label' in metadata_info.keys(): @@ -170,7 +176,6 @@ def creates_node_metadata(self, ent: List, e_type: Optional[List] = None, key_ty for syn in metadata_info['Synonym'].split('|'): edges += [(URIRef(i), URIRef(oboinowl + 'hasSynonym'), Literal(syn))] return edges - else: return None else: return None def adds_ontology_annotations(self, filename: str, graph: Graph) -> Graph: @@ -235,7 +240,7 @@ def output_metadata(self, node_integer_map: Dict, graph: Union[Set, Graph]) -> N """ if self.node_dict: - log_str = 'Writing Class Metadata'; print('\n' + log_str); logger.info(log_str) + log_str = 'Writing Class Metadata'; print(log_str); logger.info(log_str) entities = set([i for j in tqdm(graph) for i in j]); filename = self.full_kg[:-4] + '_NodeLabels.txt' with open(self.write_location + filename, 'w', encoding='utf-8') as out: out.write('entity_type' + '\t' + 'integer_id' + '\t' + 'entity_uri' + '\t' + 'label' + '\t' + diff --git a/pkt_kg/owlnets.py b/pkt_kg/owlnets.py index 2d7a1a9c..40e2c469 100644 --- a/pkt_kg/owlnets.py +++ b/pkt_kg/owlnets.py @@ -82,9 +82,9 @@ def __init__(self, graph: Union[Graph, List, str], write_location: str, filename self.write_location = write_location self.res_dir = os.path.relpath('/'.join(self.write_location.split('/')[:-1])) self.filename = filename - self.top_level_ontologies: List = ['ISO', 'SUMO', 'BFO'] # can only appear as predicates - self.relations_ontologies: List = ['RO'] # can only appear as predicates - self.support_ontologies: List = ['IAO', 'SWO', 'OBI', 'UBPROP'] # can never appear in OWL-NETS triples + self.top_level: List = ['ISO', 'SUMO', 'BFO'] # can only appear as predicates + self.relations: List = ['RO'] # can only appear as predicates + self.support: List = ['IAO', 'SWO', 'OBI', 'UBPROP'] # can never appear in OWL-NETS triples # VERIFY INPUT GRAPH if not isinstance(graph, Graph) and not isinstance(graph, List) and not isinstance(graph, str): @@ -130,34 +130,33 @@ def removes_disjoint_with_axioms(self) -> None: return None - def removes_edges_with_owl_semantics(self) -> Graph: + def removes_edges_with_owl_semantics(self, verbose: bool = True) -> Graph: """Creates a 
filtered knowledge graph, such that only nodes that are owl:Class/owl:Individual connected via a owl:ObjectProperty and not an owl:AnnotationProperty. For example: - REMOVE - edges needed to support owl semantics (not biologically meaningful): - subject: obo:CLO_0037294 - predicate: owl:AnnotationProperty - object: rdf:about="http://purl.obolibrary.org/obo/CLO_0037294" + subject: obo:CLO_0037294; predicate: owl:AnnotationProperty; object: rdf:about=obo.CLO_0037294 KEEP - biologically meaningful edges: - subject: obo:CHEBI_16130 - predicate: obo:RO_0002606 - object: obo:HP_0000832 + subject: obo:CHEBI_16130; predicate: obo:RO_0002606; object: obo:HP_0000832 + + Args: + verbose: A bool indicating whether or not to print/log method use. Returns: filtered_graph: An RDFLib graph that contains only clinically and biologically meaningful triples. """ - log_str = 'Filtering Triples'; logger.info(log_str); print(log_str) + if verbose: log_str = 'Filtering Triples'; logger.info(log_str); print(log_str) - keep_predicates, filtered_triples = set(), set() - exclude = self.top_level_ontologies + self.relations_ontologies + self.support_ontologies - for x in tqdm(self.graph): + keep, filtered = set(), set(); exclude = self.top_level + self.relations + self.support + pbar = tqdm(total=len(self.graph)) if verbose else None + for x in self.graph: + if verbose: pbar.update(1) if isinstance(x[0], URIRef) and isinstance(x[1], URIRef) and isinstance(x[2], URIRef): # handle top-level, relation, and support ontologies (top/rel can only be rel; remove support onts) subj = not any(i for i in exclude if str(x[0]).split('/')[-1].startswith(i + '_')) obj = not any(i for i in exclude if str(x[2]).split('/')[-1].startswith(i + '_')) - rel = not any(i for i in self.support_ontologies if str(x[1]).split('/')[-1].startswith(i + '_')) + rel = not any(i for i in self.support if str(x[1]).split('/')[-1].startswith(i + '_')) if subj and obj and rel: s = [i for i in list(self.graph.triples((x[0], RDF.type, None))) if (OWL.Class in i[2] or OWL.NamedIndividual in i[2]) and '#' not in str(x[0])] @@ -166,41 +165,44 @@ def removes_edges_with_owl_semantics(self) -> Graph: p = [i for i in list(self.graph.triples((x[1], RDF.type, None))) if i[2] != OWL.AnnotationProperty] if len(s) > 0 and len(o) > 0 and len(p) > 0: - if OWL.ObjectProperty in [x[2] for x in p]: keep_predicates.add(x) - else: filtered_triples |= {x} + if OWL.ObjectProperty in [x[2] for x in p]: keep.add(x) + else: filtered |= {x} if len(s) > 0 and len(o) > 0 and len(p) == 0: - if RDFS.subClassOf in x[1]: keep_predicates.add(x) - elif RDF.type in x[1]: keep_predicates.add(x) - else: filtered_triples |= {x} - elif x[1] == RDFS.subClassOf and str(OWL) not in str(x[2]): keep_predicates.add(x) - else: filtered_triples |= {x} - else: filtered_triples |= {x} - else: filtered_triples |= {x} - filtered_graph = adds_edges_to_graph(Graph(), list(keep_predicates), False) - - self.owl_nets_dict['filtered_triples'] |= filtered_triples + if RDFS.subClassOf in x[1]: keep.add(x) + elif RDF.type in x[1]: keep.add(x) + else: filtered |= {x} + elif x[1] == RDFS.subClassOf and str(OWL) not in str(x[2]): keep.add(x) + else: filtered |= {x} + else: filtered |= {x} + else: filtered |= {x} + if verbose: pbar.close() + filtered_graph = adds_edges_to_graph(Graph(), list(keep), False) + + self.owl_nets_dict['filtered_triples'] |= filtered return filtered_graph - def cleans_decoded_graph(self) -> Graph: + def cleans_decoded_graph(self, verbose: bool = True) -> Graph: """Creates a filtered knowledge 
graph, such that only nodes that are owl:Class/owl:Individual connected via a owl:ObjectProperty and not an owl:AnnotationProperty. This method is a reduced version of the removes_edges_with_owl_semantics method, which is meant to be applied to a graph after it's been decoded. + Args: + verbose: A bool indicating whether or not to print/log progress. + Returns: filtered_graph: An RDFLib graph that contains only clinically and biologically meaningful triples. """ - log_str = 'Filtering Triples'; logger.info(log_str); print(log_str) + if verbose: log_str = 'Filtering Triples'; logger.info(log_str); print(log_str) - keep_predicates, filtered_triples = set(), set() - exclude = self.top_level_ontologies + self.relations_ontologies + self.support_ontologies + keep_predicates, filtered_triples = set(), set(); exclude = self.top_level + self.relations + self.support for x in self.graph: if isinstance(x[0], URIRef) and isinstance(x[1], URIRef) and isinstance(x[2], URIRef): # handle top-level, relation, and support ontologies (top/rel can only be rel; remove support onts) subj = not any(i for i in exclude if str(x[0]).split('/')[-1].startswith(i + '_')) obj = not any(i for i in exclude if str(x[2]).split('/')[-1].startswith(i + '_')) - rel = not any(i for i in self.support_ontologies if str(x[1]).split('/')[-1].startswith(i + '_')) + rel = not any(i for i in self.support if str(x[1]).split('/')[-1].startswith(i + '_')) if subj and obj and rel: if str(OWL) not in str(x[0]) and str(OWL) not in str(x[2]): keep_predicates.add(x) else: filtered_triples |= {x} @@ -582,22 +584,23 @@ class (referenced by node) in order to remove owl-encoded information. An exampl return cleaned, results[1] else: return cleaned, axioms - def cleans_owl_encoded_entities(self, node_list: List) -> None: + def cleans_owl_encoded_entities(self, node_list: List, verbose: bool = True) -> None: """Loops over a all owl:Class and owl: Axiom objects and decodes the OWL semantics returning the corresponding triples for each type without OWL semantics. Args: node_list: A list of owl:Class and owl:Axiom entities to decode. + verbose: A bool indicating whether or not to print/log progress. Returns: None. 
""" - log_str = 'Decoding {} OWL Classes and Axioms'.format(len(node_list)); logger.info(log_str); print(log_str) + if verbose: s = 'Decoding {} OWL Classes and Axioms'.format(len(node_list)); logger.info(s); print(s) - decoded_graph: Graph = Graph(); cleaned_entities: Set = set() # ; pbar = tqdm(total=len(self.node_list)) + decoded_graph: Graph = Graph(); cleaned_entities: Set = set() # ; pbar = tqdm(total=len(node_list)) while node_list: - # pbar.update(1); + # pbar.update(1) node = node_list.pop(0); node_info = self.creates_edge_dictionary(node) if node_info is not None and len(node_info[1]) != 0: self.captures_cardinality_axioms(node_info[2], node) @@ -627,7 +630,7 @@ def cleans_owl_encoded_entities(self, node_list: List) -> None: edges = None; self.owl_nets_dict['misc'][n3(node)] = {tuple(misc)} decoded_graph = adds_edges_to_graph(decoded_graph, list(cleaned_classes), False) self.owl_nets_dict['decoded_entities'][n3(node)] = cleaned_classes - self.graph = decoded_graph; self.graph = self.cleans_decoded_graph() # ; pbar.close() + self.graph = decoded_graph; self.graph = self.cleans_decoded_graph(verbose) # ; pbar.close() return None @@ -671,7 +674,7 @@ def makes_graph_connected(self, graph: Graph, common_ancestor: Union[URIRef, str needed_triples = set((URIRef(x), rel, anc_node) for x in roots if x != anc_node) graph = adds_edges_to_graph(graph, needed_triples, False) - logs = '{} triples added to make connected.'.format(len(needed_triples)); logger.info(logs); print(logs) + logs = '{} triples added to make connected'.format(len(needed_triples)); logger.info(logs); print(logs) return graph @@ -733,7 +736,8 @@ def write_out_results(self, graph: Union[Set, Graph], kg_const: Optional[str] = # write out owl_nets dictionary with open(self.write_location + f_name.strip('.nt') + '_decoding_dict.pkl', 'wb') as out: pickle.dump(self.owl_nets_dict, out) - convert_to_networkx(self.write_location, f_name.strip('.nt'), graph) + s = convert_to_networkx(self.write_location, f_name.strip('.nt'), graph, True) + if s is not None: log_stats = '{}OWL-NETS {}'.format(personalize, s); logger.info(log_stats); print(log_stats) return None @@ -778,13 +782,13 @@ def runs_owlnets(self, cpus: int = 1) -> Tuple: if self.kg_construct_approach is not None: graph2 = set(self.purifies_graph_build(conn_graph)); g2 = derives_graph_statistics(graph2) self.write_out_results(graph2, self.kg_construct_approach) - stats = 'OWL-NETS {};\nPurified OWL-NETS {}'.format(g1, g2); print(stats); logger.info(stats) + stats = '\n\nOWL-NETS {};\nPurified OWL-NETS {}'.format(g1, g2); print(stats); logger.info(stats) # process owl decoding results - for key in self.owl_nets_dict.keys(): - if not isinstance(self.owl_nets_dict[key], Set): value = dict(ChainMap(*[d[key] for d in res2])) - else: value = self.owl_nets_dict[key] | set(ChainMap(*[d[key] for d in res2])) - self.owl_nets_dict[key] = value + for k in self.owl_nets_dict.keys(): + if not isinstance(self.owl_nets_dict[k], Set): + self.owl_nets_dict[k].update(dict(ChainMap(*[d[k] for d in res2]))) + else: self.owl_nets_dict[k] = self.owl_nets_dict[k] | set(ChainMap(*[d[k] for d in res2])) str1 = 'Decoded {} owl-encoded classes and axioms. Note the following:\nPartially processed {} cardinality ' \ 'elements\nRemoved {} owl:disjointWith axioms\nIgnored: {} misc classes; {} classes constructed with ' \ 'owl:complementOf; {} classes containing negation (e.g. 
pr#lacks_part, cl#has_not_completed)\n' \ diff --git a/pkt_kg/utils/__init__.py b/pkt_kg/utils/__init__.py index 52abdc02..23fa9ca0 100644 --- a/pkt_kg/utils/__init__.py +++ b/pkt_kg/utils/__init__.py @@ -15,4 +15,4 @@ 'connected_components', 'removes_self_loops', 'derives_graph_statistics', 'splits_knowledge_graph', 'adds_namespace_to_bnodes', 'removes_namespace_from_bnodes', 'updates_pkt_namespace_identifiers', 'finds_node_type', 'updates_graph_namespace', 'maps_ids_to_integers', 'n3', 'appends_to_existing_file', - 'deduplicates_file', 'merges_files', 'convert_to_networkx'] + 'deduplicates_file', 'merges_files', 'convert_to_networkx', 'sublist_creator'] diff --git a/pkt_kg/utils/data_utils.py b/pkt_kg/utils/data_utils.py index 8b446d0c..f772fa90 100644 --- a/pkt_kg/utils/data_utils.py +++ b/pkt_kg/utils/data_utils.py @@ -22,6 +22,7 @@ * genomic_id_mapper * deduplicates_file * merges_files +* sublist_creator Outputs data * outputs_dictionary_data @@ -30,6 +31,7 @@ # import needed libraries import ftplib import gzip +import heapq import json import numpy as np # type: ignore import os @@ -407,12 +409,11 @@ def outputs_dictionary_data(dict_object: Optional[Dict], filename: str) -> None: return None -def deduplicates_file(src_filepath: str, dest_filepath: Optional[str] = None) -> None: +def deduplicates_file(src_filepath: str) -> None: """Removes duplicates from a file. Args: src_filepath: A string specifying a path to an existing file. - dest_filepath: A string specifying a path to write deduplicated file to (default=src_filepath). Returns: None. @@ -420,14 +421,12 @@ def deduplicates_file(src_filepath: str, dest_filepath: Optional[str] = None) -> print('Depduplicating File: {}'.format(src_filepath)) - temp_data = None - if dest_filepath is None: - temp_data = '/'.join(src_filepath.split('/')[:-1]) + '/test.nt' - os.rename(src_filepath, temp_data) - dest_filepath, src_filepath = src_filepath, temp_data - - os.system('sort {} | uniq > {}'.format(src_filepath, dest_filepath)) - if temp_data and os.path.exists(temp_data): os.remove(temp_data) + lines = list(set(open(src_filepath, 'r').readlines())); pbar = tqdm(total=len(lines)) + with open(src_filepath, 'w') as f: + while len(lines) > 0: + x = lines.pop(); pbar.update(1) + f.write(x) if x.endswith('\n') else f.write(x + '\n') + pbar.close() return None @@ -449,3 +448,34 @@ def merges_files(filepath1: str, filepath2: str, merged_filepath: str) -> None: os.system('( cat {} ; echo ''; cat {} ) > {}'.format(filepath1, filepath2, merged_filepath)) return None + + +def sublist_creator(actors: Union[Dict, List], chunk_size: int) -> List: + """Takes a list or dictionary of sizes and returns sublists, where the sublists are balanced according to their total size. + + SOURCE: https://stackoverflow.com/questions/61648065 + + Args: + actors: A list or a dictionary keyed by edge identifier with the length of each associated edge list + stored as the values. + chunk_size: An integer specifying the number of sublists that should be returned. + + Returns: + updated_lists: A list of lists, where the inner lists have been balanced by their size.
+ """ + + if isinstance(actors, Dict): values = sorted(list(actors.values()), reverse=True) + else: values = sorted(actors, reverse=True) + lists: List = [[] for _ in range(chunk_size)]; totals = [(0, i) for i in range(chunk_size)]; heapq.heapify(totals) + for value in values: + total, index = heapq.heappop(totals); lists[index].append(value); heapq.heappush(totals, (total + value, index)) + + # update list to return string identifier associated with each list length + if isinstance(actors, Dict): + updated_lists = []; used_ids = set() + for sub in lists: + sub_list = [[k for k, v in actors.items() if v == x and k not in used_ids][0] for x in sub] + updated_lists += [sub_list]; used_ids |= set(x for y in sub_list for x in y) + else: updated_lists = lists + + return updated_lists diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index ce4a05a4..4e63e3ba 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -365,7 +365,7 @@ def gets_entity_ancestors(graph: Graph, uris: List[Union[URIRef, str]], rel: Uni return gets_entity_ancestors(graph, uris, prop, cls_lst) -def connected_components(graph: Graph) -> List: +def connected_components(graph: Union[Graph, Set]) -> List: """Creates a dictionary where the keys are integers representing a component number and the values are sets containing the nodes for a given component. This method works by first converting the RDFLib graph into a NetworkX multi-directed graph, which is converted to a undirected graph prior to calculating the connected @@ -375,17 +375,15 @@ def connected_components(graph: Graph) -> List: graph: An RDFLib Graph object. Returns: - component_dict: A dictionary where the keys are integers representing a component number and the values are sets - containing the nodes for a given component. + components: A list of the nodes in each component detected in the graph. """ nx_mdg = nx.MultiDiGraph() - for s, p, o in tqdm(graph): - nx_mdg.add_edge(s, o, **{'key': p}) + for s, p, o in tqdm(graph): nx_mdg.add_edge(s, o, **{'key': p}) print('Calculating Connected Components') - comps = list(nx.connected_components(nx_mdg.to_undirected())); component_dict = sorted(comps, key=len, reverse=True) + components = list(nx.connected_components(nx_mdg.to_undirected())) - return component_dict + return components def removes_self_loops(graph: Graph) -> List: @@ -686,7 +684,7 @@ def n3(node: Union[URIRef, BNode, Literal]) -> str: return serialized_node -def convert_to_networkx(write_location: str, full_kg: str, graph: Optional[Union[Graph, Set]] = None) -> None: +def convert_to_networkx(write_loc: str, filename: str, graph: Union[Graph, Set], stats: bool = False) -> Optional[str]: """Converts an RDFLib.Graph object into a Networkx MultiDiGraph and pickles a copy locally. Each node is provided a key that is the URI identifier and each edge is given a key which is an md5 hash of the triple and a weight of 0.0. An example of the output is shown below. The md5 hash is meant to store a unique key that represents that @@ -703,33 +701,26 @@ def convert_to_networkx(write_location: str, full_kg: str, graph: Optional[Union - edge data: [(obo.SO_0000288, obo.SO_0000287', {'predicate_key': '9cbd4826291e7b38eb', 'weight': 0.0})] Args: - write_location: A string pointing to a local directory for writing data. - full_kg: A string containing the subdirectory and name of the the knowledge graph file. + write_loc: A string pointing to a local directory for writing data. 
+ filename: A string containing the subdirectory and name of the knowledge graph file. graph: An RDFLib Graph object or set of RDFLib Graph triples. + stats: A bool indicating whether or not to derive network statistics after writing networkx file to disk. Returns: - None. - - Raises: - IOError: If the file referenced by filename does not exist. + network_stats: A string containing network statistics information, or None when stats is False. """ print('Converting Knowledge Graph to MultiDiGraph') - if graph is None: - file_type = 'xml' if 'OWLNETS' not in full_kg else full_kg.split('.')[-1] - ext = '.owl' if file_type == 'xml' else '.nt' - graph = Graph().parse(write_location + full_kg + ext, format=file_type) - nx_mdg = nx.MultiDiGraph() for s, p, o in tqdm(graph): pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest() nx_mdg.add_node(s, key=n3(s)); nx_mdg.add_node(o, key=n3(o)) nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight': 0.0}) print('Pickling MultiDiGraph') - nx.write_gpickle(nx_mdg, write_location + full_kg + '_NetworkxMultiDiGraph.gpickle'); del nx_mdg - - return None + nx.write_gpickle(nx_mdg, write_loc + filename + '_NetworkxMultiDiGraph.gpickle') + if stats: print('Generating Network Statistics'); return derives_graph_statistics(nx_mdg) + else: return None def appends_to_existing_file(edges: Union[List, Set, Graph], filepath: str, sep: str = ' ') -> None: diff --git a/resources/edge_source_list.txt b/resources/edge_source_list.txt index cd959bc5..2bf9358e 100644 --- a/resources/edge_source_list.txt +++ b/resources/edge_source_list.txt @@ -6,7 +6,6 @@ chemical-gomf, https://storage.googleapis.com/pheknowlator/current_build/data/or chemical-pathway, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/ChEBI2Reactome_All_Levels.txt chemical-phenotype, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/CTD_chemicals_diseases.tsv chemical-protein, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/CTD_chem_gene_ixns.tsv -chemical-rna, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/CTD_chem_gene_ixns.tsv disease-phenotype, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/phenotype_annotation.tab gene-disease, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/curated_gene_disease_associations.tsv gene-gene, https://storage.googleapis.com/pheknowlator/current_build/data/original_data/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt diff --git a/resources/resource_info.txt b/resources/resource_info.txt index 6678ed7e..e7fe4d7a 100644 --- a/resources/resource_info.txt +++ b/resources/resource_info.txt @@ -6,7 +6,6 @@ chemical-gomf|:;MESH_;GO_|class-class|RO_0002436|http://purl.obolibrary.org/obo/ chemical-pathway|;CHEBI_;|class-entity|RO_0000056|http://purl.obolibrary.org/obo/|https://reactome.org/content/detail/|t|0;1|None|None|5;==;Homo sapiens chemical-phenotype|:;MESH_;|class-class|RO_0002606|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/PHENOTYPE_HPO_MAP.txt|5;!=;''|None chemical-protein|;MESH_;|class-class|RO_0002434|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt|9;affects;not in x|6;==;Homo sapiens::5;.startswith('protein');
-chemical-rna|;MESH_;|class-entity|RO_0002434|http://purl.obolibrary.org/obo/|https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt|9;activity;not in x::9;affects;not in x::9;reaction;not in x|6;==;Homo sapiens::5;.startswith('mRNA'); disease-phenotype|:;;HP_|class-class|RO_0002200|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/DISEASE_MONDO_MAP.txt|None|None|None gene-disease|;;|entity-class|RO_0003302|http://www.ncbi.nlm.nih.gov/gene/|http://purl.obolibrary.org/obo/|t|0;4|1:./resources/processed_data/DISEASE_MONDO_MAP.txt|10;>=;1.0|None gene-gene|;;|entity-entity|RO_0002435|http://www.ncbi.nlm.nih.gov/gene/|http://www.ncbi.nlm.nih.gov/gene/|t|0;1|0:./resources/processed_data/ENSEMBL_GENE_ENTREZ_GENE_MAP.txt;1:./resources/processed_data/ENSEMBL_GENE_ENTREZ_GENE_MAP.txt|2;>=;0.00019|None diff --git a/tests/test_data_utils_miscellaneous.py b/tests/test_data_utils_miscellaneous.py index 1314ca48..32182f31 100644 --- a/tests/test_data_utils_miscellaneous.py +++ b/tests/test_data_utils_miscellaneous.py @@ -5,6 +5,7 @@ import unittest from tqdm import tqdm +from typing import List from pkt_kg.utils import * @@ -88,23 +89,6 @@ def test_outputs_dictionary_data(self): return None def test_deduplicates_file(self): - """Tests the deduplicates_file method.""" - - data_dir = os.path.dirname(__file__) - src_filepath = data_dir + '/data/test_file.nt' - dest_filepath = data_dir + '/data/test_file_cleaned.nt' - deduplicates_file(src_filepath, dest_filepath) - - # test method - with open(dest_filepath) as f: data = f.readlines() - self.assertTrue(len(data) == 4) - - # clean up environment - if os.path.exists(dest_filepath): os.remove(dest_filepath) - - return None - - def test_deduplicates_file_only_src(self): """Tests the deduplicates_file method when a destination location is not provided.""" data_dir = os.path.dirname(__file__) @@ -114,7 +98,7 @@ def test_deduplicates_file_only_src(self): # test method with open(src_filepath) as f: data = f.readlines() - self.assertTrue(len(data) == 4) + self.assertTrue(len(data) == 5) # clean up environment if os.path.exists(src_filepath): os.remove(src_filepath) @@ -139,6 +123,37 @@ def test_merges_files(self): return None + def tests_sublist_creator_dict(self): + """Tests the sublist_creator method when the input is a dictionary.""" + + actors = {'protein-cell': 75308, 'protein-cofactor': 1994, 'variant-disease': 35686, 'rna-anatomy': 444974, + 'protein-catalyst': 24311, 'chemical-protein': 64330, 'chemical-gene': 16695, 'protein-gobp': 137926, + 'protein-pathway': 114807, 'protein-anatomy': 30677, 'chemical-pathway': 28357, 'gene-gene': 23525} + lists = sublist_creator(actors, 5) + + self.assertIsInstance(lists, List) + self.assertEqual(len(lists), 5) + self.assertEqual(lists, + [['rna-anatomy'], ['protein-gobp'], ['protein-pathway', 'gene-gene'], + ['protein-cell', 'protein-anatomy', 'protein-catalyst', 'protein-cofactor'], + ['chemical-protein', 'variant-disease', 'chemical-pathway', 'chemical-gene']]) + + return None + + def tests_sublist_creator_list(self): + """Tests the sublist_creator method when the input is a list.""" + + actors = [75308, 1994, 35686, 444974, 24311, 64330, 16695, 137926, 114807, 30677, 28357, 23525] + lists = sublist_creator(actors, 5) + + self.assertIsInstance(lists, List) + self.assertEqual(len(lists), 5) + self.assertEqual(lists, + [[444974], [137926],
[114807, 23525], + [75308, 30677, 24311, 1994], [64330, 35686, 28357, 16695]]) + + return None + def tearDown(self): # remove temp directory diff --git a/tests/test_kg_utils.py b/tests/test_kg_utils.py index 7aec174f..f30f546f 100644 --- a/tests/test_kg_utils.py +++ b/tests/test_kg_utils.py @@ -305,13 +305,15 @@ def test_convert_to_networkx(self): """Tests the convert_to_networkx method.""" # check that files were created - convert_to_networkx(write_location=self.dir_loc, full_kg='/so_with_imports', graph=None) - self.assertTrue(os.path.exists(self.dir_loc + '/so_with_imports_NetworkxMultiDiGraph.gpickle')) + graph = Graph().parse(self.good_ontology_file_location) + stats = convert_to_networkx(self.dir_loc, '/so_with_imports', graph, True) + print(stats) # load graph and check structure s = obo.SO_0000288; o = obo.SO_0000287; p = RDFS.subClassOf graph = nx.read_gpickle(self.dir_loc + '/so_with_imports_NetworkxMultiDiGraph.gpickle') self.assertEqual(graph[s][o][p], {'predicate_key': '72908c671b9244c1a1dc2b36e4708f15', 'weight': 0.0}) + self.assertIsInstance(stats, str) # clean up the environment os.remove(self.dir_loc + '/so_with_imports_NetworkxMultiDiGraph.gpickle') diff --git a/tests/test_knowledge_graph.py b/tests/test_knowledge_graph.py index e7ef6940..642e33dd 100644 --- a/tests/test_knowledge_graph.py +++ b/tests/test_knowledge_graph.py @@ -13,14 +13,16 @@ from collections import ChainMap from mock import patch -from rdflib import Graph, URIRef, BNode -from rdflib.namespace import OWL, RDF +from rdflib import Graph, URIRef, BNode, Namespace +from rdflib.namespace import OWL, RDF, RDFS from typing import Dict, List from pkt_kg.__version__ import __version__ from pkt_kg.knowledge_graph import FullBuild, PartialBuild, PostClosureBuild from pkt_kg.metadata import Metadata -from pkt_kg.utils import appends_to_existing_file, gets_ontology_classes, gets_object_properties, splits_knowledge_graph +from pkt_kg.utils import * + +obo = Namespace('http://purl.obolibrary.org/obo/') class TestKGBuilder(unittest.TestCase): @@ -76,10 +78,10 @@ def setUp(self): "uri": ["http://www.ncbi.nlm.nih.gov/gene/", "http://purl.obolibrary.org/obo/"], "edge_list": [["2", "SO_0000162"], ["2", "SO_0000196"], - ["2", "SO_0000323"], ["9", "SO_0001490"], - ["9", "SO_0000301"], ["9", "SO_0001560"], - ["9", "SO_0001560"], ["10", "SO_0000444"], - ["10", "SO_0002138"], ["10", "SO_0000511"]]}, + ["3", "SO_0000323"], ["9", "SO_0001490"], + ["10", "SO_0000301"], ["11", "SO_0001560"], + ["12", "SO_0001560"], ["17", "SO_0000444"], + ["18", "SO_0002138"], ["20", "SO_0000511"]]}, "gene-gene": {"data_type": "entity-entity", "edge_relation": "RO_0002435", "uri": ["http://www.ncbi.nlm.nih.gov/gene/", @@ -103,10 +105,10 @@ def setUp(self): "uri": ["http://www.ncbi.nlm.nih.gov/gene/", "http://purl.obolibrary.org/obo/"], "edge_list": [["2", "SO_0000162"], ["2", "SO_0000196"], - ["2", "SO_0000323"], ["9", "SO_0001490"], - ["9", "SO_0000301"], ["9", "SO_0001560"], - ["9", "SO_0001560"], ["10", "SO_0000444"], - ["10", "SO_0002138"], ["10", "SO_0000511"]]}, + ["3", "SO_0000323"], ["9", "SO_0001490"], + ["10", "SO_0000301"], ["11", "SO_0001560"], + ["12", "SO_0001560"], ["17", "SO_0000444"], + ["18", "SO_0002138"], ["19", "SO_0000511"]]}, "gene-gene": {"data_type": "entity-entity", "edge_relation": "RO_0002435", "uri": ["http://www.ncbi.nlm.nih.gov/gene/", @@ -511,6 +513,28 @@ def test_checks_relations(self): return None + def test_gets_edge_statistics(self): + """Tests the gets_edge_statistics method.""" + + # no inverse edges + edges = 
[(1, 2, 3), (3, 2, 5), (4, 6, 7)] + stats = self.inner_class.gets_edge_statistics('gene-gene', edges, [{1, 2, 3}, {1, 2, 3}, 8]) + expected_str = '3 OWL Edges, 8 Original Edges; 5 OWL Nodes, Original Nodes: 3 gene(s), 3 gene(s)' + self.assertEqual(stats, expected_str) + + return None + + def test_gets_edge_statistics_inverse_relations(self): + """Tests the gets_edge_statistics method when including inverse relations.""" + + # no inverse edges + edges = [(1, 2, 3), (3, 2, 5), (4, 6, 7)] + stats = self.inner_class.gets_edge_statistics('drug-gene', edges, [{1, 2, 3}, {1, 2, 3}, 8]) + expected_str = '3 OWL Edges, 8 Original Edges; 5 OWL Nodes, Original Nodes: 3 drug(s), 3 gene(s)' + self.assertEqual(stats, expected_str) + + return None + def test_creates_new_edges_not_adding_metadata_to_kg(self): """Tests the creates_new_edges method without adding node metadata to the KG.""" @@ -531,6 +555,8 @@ def test_creates_new_edges_not_adding_metadata_to_kg(self): full_kg_owl = '_'.join(self.kg_subclass.full_kg.split('_')[0:-1]) + '_OWL.owl' annot, full = full_kg_owl[:-4] + '_AnnotationsOnly.nt', full_kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, self.kg_subclass.write_location + annot, ' ') + clean_graph = updates_pkt_namespace_identifiers(self.kg_subclass.graph, self.kg_subclass.construct_approach) + # test method shutil.copy(self.kg_subclass.write_location + annot, self.kg_subclass.write_location + full) appends_to_existing_file(set(self.kg_subclass.graph), self.kg_subclass.write_location + full, ' ') @@ -543,13 +569,16 @@ def test_creates_new_edges_not_adding_metadata_to_kg(self): ray.init(local_mode=True, ignore_reinit_error=True) actors = [ray.remote(self.kg_subclass.EdgeConstructor).remote(args) for _ in range(self.kg_subclass.cpus)] for i in range(0, len(edges)): actors[i % self.kg_subclass.cpus].creates_new_edges.remote(edges[i]) - graphs = [self.kg_subclass.graph] + ray.get([x.graph_getter.remote() for x in actors]) + res = ray.get([x.graph_getter.remote() for x in actors]) + g1 = [self.kg_subclass.graph] + [x[0] for x in res]; g2 = [clean_graph] + [x[1] for x in res] error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors ray.shutdown() # check that edges were added to the graph - self.kg_subclass.graph = set(x for y in [set(x) for x in graphs] for x in y) - self.assertTrue(len(self.kg_subclass.graph) > 0) + graph1 = set(x for y in [set(x) for x in g1] for x in y) + graph2 = set(x for y in [set(x) for x in g2] for x in y) + self.assertEqual(len(graph1), 9820) + self.assertEqual(len(graph2), 9774) self.assertIsInstance(error_dicts, Dict) # check graph files were saved f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt' @@ -576,6 +605,7 @@ def test_creates_new_edges_adding_metadata_to_kg(self): full_kg_owl = '_'.join(self.kg_subclass.full_kg.split('_')[0:-1]) + '_OWL.owl' annot, full = full_kg_owl[:-4] + '_AnnotationsOnly.nt', full_kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, self.kg_subclass.write_location + annot, ' ') + clean_graph = updates_pkt_namespace_identifiers(self.kg_subclass.graph, self.kg_subclass.construct_approach) # test method shutil.copy(self.kg_subclass.write_location + annot, self.kg_subclass.write_location + full) appends_to_existing_file(set(self.kg_subclass.graph), self.kg_subclass.write_location + full, ' ') @@ -588,13 +618,16 @@ def test_creates_new_edges_adding_metadata_to_kg(self): ray.init(local_mode=True, ignore_reinit_error=True) actors = 
[ray.remote(self.kg_subclass.EdgeConstructor).remote(args) for _ in range(self.kg_subclass.cpus)] for i in range(0, len(edges)): actors[i % self.kg_subclass.cpus].creates_new_edges.remote(edges[i]) - graphs = [self.kg_subclass.graph] + ray.get([x.graph_getter.remote() for x in actors]) + res = ray.get([x.graph_getter.remote() for x in actors]) + g1 = [self.kg_subclass.graph] + [x[0] for x in res]; g2 = [clean_graph] + [x[1] for x in res] error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors ray.shutdown() # check that edges were added to the graph - self.kg_subclass.graph = set(x for y in [set(x) for x in graphs] for x in y) - self.assertTrue(len(self.kg_subclass.graph) > 0) + graph1 = set(x for y in [set(x) for x in g1] for x in y) + graph2 = set(x for y in [set(x) for x in g2] for x in y) + self.assertEqual(len(graph1), 9780) + self.assertEqual(len(graph2), 9746) self.assertIsInstance(error_dicts, Dict) # check graph files were saved f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt' @@ -604,35 +637,13 @@ def test_creates_new_edges_adding_metadata_to_kg(self): return None - def test_gets_edge_statistics(self): - """Tests the gets_edge_statistics method.""" - - # no inverse edges - edges = [(1, 2, 3), (3, 2, 5), (4, 6, 7)] - stats = self.inner_class.gets_edge_statistics('gene-gene', edges, [{1, 2, 3}, {1, 2, 3}, 8]) - expected_str = '3 OWL Edges, 8 Original Edges; 5 OWL Nodes, Original Nodes: 3 gene(s), 3 gene(s)' - self.assertEqual(stats, expected_str) - - return None - - def test_gets_edge_statistics_inverse_relations(self): - """Tests the gets_edge_statistics method when including inverse relations.""" - - # no inverse edges - edges = [(1, 2, 3), (3, 2, 5), (4, 6, 7)] - stats = self.inner_class.gets_edge_statistics('drug-gene', edges, [{1, 2, 3}, {1, 2, 3}, 8]) - expected_str = '3 OWL Edges, 8 Original Edges; 5 OWL Nodes, Original Nodes: 3 drug(s), 3 gene(s)' - self.assertEqual(stats, expected_str) - - return None - def test_creates_new_edges_instance_no_inverse(self): """Tests the creates_new_edges method when applied to a kg with instance-based construction without inverse relations.""" self.kg_instance.reverse_relation_processor() # make sure that kg is empty - self.kg_instance.graph = Graph() + self.kg_instance.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl') # initialize metadata class meta = Metadata(self.kg_instance.kg_version, self.kg_instance.write_location, self.kg_instance.full_kg, self.kg_instance.node_data, self.kg_instance.node_dict) @@ -642,6 +653,7 @@ def test_creates_new_edges_instance_no_inverse(self): full_kg_owl = '_'.join(self.kg_instance.full_kg.split('_')[0:-1]) + '_OWL.owl' annot, full = full_kg_owl[:-4] + '_AnnotationsOnly.nt', full_kg_owl[:-4] + '.nt' appends_to_existing_file(annotation_triples, self.kg_instance.write_location + annot, ' ') + clean_graph = updates_pkt_namespace_identifiers(self.kg_instance.graph, self.kg_instance.construct_approach) # test method shutil.copy(self.kg_instance.write_location + annot, self.kg_instance.write_location + full) @@ -656,14 +668,16 @@ def test_creates_new_edges_instance_no_inverse(self): ray.init(local_mode=True, ignore_reinit_error=True) actors = [ray.remote(self.kg_instance.EdgeConstructor).remote(args) for _ in range(self.kg_instance.cpus)] for i in range(0, len(edges)): actors[i % self.kg_instance.cpus].creates_new_edges.remote(edges[i]) - graphs = [self.kg_instance.graph] + ray.get([x.graph_getter.remote() for x in actors]) + res = 
         # initialize metadata class
         meta = Metadata(self.kg_instance.kg_version, self.kg_instance.write_location, self.kg_instance.full_kg,
                         self.kg_instance.node_data, self.kg_instance.node_dict)
@@ -642,6 +653,7 @@ def test_creates_new_edges_instance_no_inverse(self):
         full_kg_owl = '_'.join(self.kg_instance.full_kg.split('_')[0:-1]) + '_OWL.owl'
         annot, full = full_kg_owl[:-4] + '_AnnotationsOnly.nt', full_kg_owl[:-4] + '.nt'
         appends_to_existing_file(annotation_triples, self.kg_instance.write_location + annot, ' ')
+        clean_graph = updates_pkt_namespace_identifiers(self.kg_instance.graph, self.kg_instance.construct_approach)
+
         # test method
         shutil.copy(self.kg_instance.write_location + annot, self.kg_instance.write_location + full)
@@ -656,14 +668,16 @@ def test_creates_new_edges_instance_no_inverse(self):
         ray.init(local_mode=True, ignore_reinit_error=True)
         actors = [ray.remote(self.kg_instance.EdgeConstructor).remote(args) for _ in range(self.kg_instance.cpus)]
         for i in range(0, len(edges)): actors[i % self.kg_instance.cpus].creates_new_edges.remote(edges[i])
-        graphs = [self.kg_instance.graph] + ray.get([x.graph_getter.remote() for x in actors])
+        res = ray.get([x.graph_getter.remote() for x in actors])
+        g1 = [self.kg_instance.graph] + [x[0] for x in res]; g2 = [clean_graph] + [x[1] for x in res]
         error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors
         ray.shutdown()
         # check that edges were added to the graph
-        self.kg_instance.graph = set(x for y in [set(x) for x in graphs] for x in y)
-        self.assertTrue(len(self.kg_instance.graph) > 0)
-        self.assertEqual(len(self.kg_instance.graph), 29)
+        graph1 = set(x for y in [set(x) for x in g1] for x in y)
+        graph2 = set(x for y in [set(x) for x in g2] for x in y)
+        self.assertEqual(len(graph1), 9702)
+        self.assertEqual(len(graph2), 9682)
         self.assertIsInstance(error_dicts, Dict)
         # check graph files were saved
         f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
@@ -674,12 +688,12 @@ def test_creates_new_edges_instance_no_inverse(self):

         return None

     def test_creates_new_edges_instance_inverse(self):
-        """Tests the creates_new_edges method when applied to a kg with instance-based construction with
-        inverse relations."""
+        """Tests the creates_new_edges method when applied to a kg with instance-based construction with inverse
+        relations."""

         self.kg_instance2.reverse_relation_processor()
-        # make sure that kg is empty
-        self.kg_instance2.graph = Graph()
+        # initialize the kg with the SO ontology
+        self.kg_instance2.graph = Graph().parse(self.dir_loc + '/ontologies/so_with_imports.owl')
         # initialize metadata class
         meta = Metadata(self.kg_instance2.kg_version, self.kg_instance2.write_location, self.kg_instance2.full_kg,
                         self.kg_instance2.node_data, self.kg_instance2.node_dict)
@@ -689,6 +703,7 @@ def test_creates_new_edges_instance_inverse(self):
         full_kg_owl = '_'.join(self.kg_instance2.full_kg.split('_')[0:-1]) + '_OWL.owl'
         annot, full = full_kg_owl[:-4] + '_AnnotationsOnly.nt', full_kg_owl[:-4] + '.nt'
         appends_to_existing_file(annotation_triples, self.kg_instance2.write_location + annot, ' ')
+        clean_graph = updates_pkt_namespace_identifiers(self.kg_instance2.graph, self.kg_instance2.construct_approach)
+
         # test method
         shutil.copy(self.kg_instance2.write_location + annot, self.kg_instance2.write_location + full)
@@ -703,14 +718,16 @@ def test_creates_new_edges_instance_inverse(self):
         ray.init(local_mode=True, ignore_reinit_error=True)
         actors = [ray.remote(self.kg_instance2.EdgeConstructor).remote(args) for _ in range(self.kg_instance2.cpus)]
         for i in range(0, len(edges)): actors[i % self.kg_instance2.cpus].creates_new_edges.remote(edges[i])
-        graphs = [self.kg_instance2.graph] + ray.get([x.graph_getter.remote() for x in actors])
+        res = ray.get([x.graph_getter.remote() for x in actors])
+        g1 = [self.kg_instance2.graph] + [x[0] for x in res]; g2 = [clean_graph] + [x[1] for x in res]
         error_dicts = dict(ChainMap(*ray.get([x.error_dict_getter.remote() for x in actors]))); del actors
         ray.shutdown()
         # check that edges were added to the graph
-        self.kg_instance2.graph = set(x for y in [set(x) for x in graphs] for x in y)
-        self.assertTrue(len(self.kg_instance2.graph) > 0)
-        self.assertEqual(len(self.kg_instance2.graph), 36)
+        graph1 = set(x for y in [set(x) for x in g1] for x in y)
+        graph2 = set(x for y in [set(x) for x in g2] for x in y)
+        self.assertEqual(len(graph1), 9707)
+        self.assertEqual(len(graph2), 9687)
         self.assertIsInstance(error_dicts, Dict)
         # check graph files were saved
         f_name = full_kg_owl[:-4] + '_AnnotationsOnly.nt'
@@ -721,8 +738,8 @@ def test_creates_new_edges_instance_inverse(self):

         return None

     def test_creates_new_edges_adding_metadata_to_kg_bad(self):
-        """Tests the creates_new_edges method and adds node metadata to the KG, but also makes sure that a
-        log file is written for genes that are not in the subclass_map."""
+        """Tests the creates_new_edges method and adds node metadata to the KG, but also makes sure that a log file is
+        written for genes that are not in the subclass_map."""

         self.kg_subclass.reverse_relation_processor()
         # make sure that kg is empty
@@ -733,17 +750,12 @@ def test_creates_new_edges_adding_metadata_to_kg_bad(self):
         meta = Metadata(self.kg_subclass.kg_version, self.kg_subclass.write_location, self.kg_subclass.full_kg,
                         self.kg_subclass.node_data, self.kg_subclass.node_dict)
         if self.kg_subclass.node_data: meta.metadata_processor(); meta.extract_metadata(self.kg_subclass.graph)
-        # alter gene list - adding genes not in the subclass_map dictionary
-        self.kg_subclass.edge_dict['gene-gene']['edge_list'] = [["1", "1080"], ["1", "4267"], ["4800", "10190"],
-                                                                ["4800", "80219"], ["2729", "1962"], ["2729", "5096"],
-                                                                ["8837", "6774"], ["8837", "8754"]]
         # test method
         args = {'construction': self.kg_subclass.construct_approach, 'edge_dict': self.kg_subclass.edge_dict,
                 'kg_owl': '', 'rel_dict': self.kg_subclass.relations_dict, 'ont_cls': self.kg_subclass.ont_classes,
                 'metadata': meta.creates_node_metadata, 'inverse_dict': self.kg_subclass.inverse_relations_dict,
                 'node_data': self.kg_subclass.node_data, 'obj_props': self.kg_subclass.obj_properties,
-                'write_loc': self.kg_subclass.write_location}
-        edges = [x for x in self.kg_subclass.edge_dict.keys()]
+                'write_loc': self.kg_subclass.write_location}; edges = [x for x in self.kg_subclass.edge_dict.keys()]
         ray.init(local_mode=True, ignore_reinit_error=True)
         actors = [ray.remote(self.kg_subclass.EdgeConstructor).remote(args) for _ in range(self.kg_subclass.cpus)]
         for i in range(0, len(edges)): actors[i % self.kg_subclass.cpus].creates_new_edges.remote(edges[i])
@@ -753,7 +765,20 @@ def test_creates_new_edges_adding_metadata_to_kg_bad(self):
         # check that log file was written out
         self.assertIsInstance(error_dicts, Dict)
         self.assertEqual(len(error_dicts), 1)
-        self.assertEqual(error_dicts, {'gene-gene': ['1080', '4267']})
+        self.assertIn('gene-phenotype', error_dicts.keys())
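+        # sorted() compares the error keys as strings, so '10' and '20' order before '9'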
+        self.assertEqual(sorted(list(error_dicts['gene-phenotype'])), ['10', '20', '9'])
+
+        return None
+
+    def tests_graph_getter(self):
+        """Tests the graph_getter method."""
+
+        results = self.inner_class.graph_getter()
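+        # graph_getter is expected to return two rdflib Graphs: the full OWL graph first and the
+        # copy with pkt namespace identifiers removed second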
"DOID_80219"], ["DOID_2729", "DOID_1962"], ["DOID_2729", "DOID_5096"], - ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]}, - "entity_namespaces": {"gene": "http://purl.uniprot.org/geneid/"} + ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]} } # save data diff --git a/tests/test_knowledge_graph_partial.py b/tests/test_knowledge_graph_partial.py index 2fad8307..87223512 100644 --- a/tests/test_knowledge_graph_partial.py +++ b/tests/test_knowledge_graph_partial.py @@ -65,10 +65,10 @@ def setUp(self): "uri": ["http://www.ncbi.nlm.nih.gov/gene/", "http://purl.obolibrary.org/obo/"], "edge_list": [["2", "SO_0000162"], ["2", "SO_0000196"], - ["2", "SO_0000323"], ["9", "SO_0001490"], - ["9", "SO_0000301"], ["9", "SO_0001560"], - ["9", "SO_0001560"], ["10", "SO_0000444"], - ["10", "SO_0002138"], ["10", "SO_0000511"]]}, + ["3", "SO_0000323"], ["9", "SO_0001490"], + ["10", "SO_0000301"], ["11", "SO_0001560"], + ["12", "SO_0001560"], ["17", "SO_0000444"], + ["18", "SO_0002138"], ["20", "SO_0000511"]]}, "gene-gene": {"data_type": "entity-entity", "edge_relation": "RO_0002435", "uri": ["http://www.ncbi.nlm.nih.gov/gene/", @@ -83,8 +83,7 @@ def setUp(self): "edge_list": [["DOID_3075", "DOID_1080"], ["DOID_3075", "DOID_4267"], ["DOID_4800", "DOID_10190"], ["DOID_4800", "DOID_80219"], ["DOID_2729", "DOID_1962"], ["DOID_2729", "DOID_5096"], - ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]}, - "entity_namespaces": {"gene": "http://purl.uniprot.org/geneid/"} + ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]} } # save data diff --git a/tests/test_knowledge_graph_post_closure.py b/tests/test_knowledge_graph_post_closure.py index 1f80b76c..3cd0caa9 100644 --- a/tests/test_knowledge_graph_post_closure.py +++ b/tests/test_knowledge_graph_post_closure.py @@ -68,10 +68,10 @@ def setUp(self): "uri": ["http://www.ncbi.nlm.nih.gov/gene/", "http://purl.obolibrary.org/obo/"], "edge_list": [["2", "SO_0000162"], ["2", "SO_0000196"], - ["2", "SO_0000323"], ["9", "SO_0001490"], - ["9", "SO_0000301"], ["9", "SO_0001560"], - ["9", "SO_0001560"], ["10", "SO_0000444"], - ["10", "SO_0002138"], ["10", "SO_0000511"]]}, + ["3", "SO_0000323"], ["9", "SO_0001490"], + ["10", "SO_0000301"], ["11", "SO_0001560"], + ["12", "SO_0001560"], ["17", "SO_0000444"], + ["18", "SO_0002138"], ["20", "SO_0000511"]]}, "gene-gene": {"data_type": "entity-entity", "edge_relation": "RO_0002435", "uri": ["http://www.ncbi.nlm.nih.gov/gene/", @@ -86,8 +86,7 @@ def setUp(self): "edge_list": [["DOID_3075", "DOID_1080"], ["DOID_3075", "DOID_4267"], ["DOID_4800", "DOID_10190"], ["DOID_4800", "DOID_80219"], ["DOID_2729", "DOID_1962"], ["DOID_2729", "DOID_5096"], - ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]}, - "entity_namespaces": {"gene": "http://purl.uniprot.org/geneid/"} + ["DOID_8837", "DOID_6774"], ["DOID_8837", "DOID_8754"]]} } # save data diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 4b0e6161..d9a1a71d 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -129,7 +129,7 @@ def test_creates_node_metadata_relations(self): # check that nothing is returned if the entities are classes updated_graph_2 = self.metadata.creates_node_metadata(ent=['http://purl.obolibrary.org/obo/RO_0002597'], e_type=['class'], key_type='relations') - self.assertTrue(updated_graph_2 is None) + self.assertTrue(len(updated_graph_2) == 2) # test when the node does not have metadata updated_graph_3 = self.metadata.creates_node_metadata(['http://www.ncbi.nlm.nih.gov/gene/None'],