Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/biolink/kgx
Browse files Browse the repository at this point in the history
  • Loading branch information
sierra-moxon committed Aug 13, 2022
2 parents 8afa54f + 13ac53f commit a981a3e
Show file tree
Hide file tree
Showing 20 changed files with 56 additions and 60 deletions.
4 changes: 0 additions & 4 deletions kgx/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,6 @@ def _process_knowledge_source(ksf: str, spec: str) -> Union[str, bool, Tuple]:
if spec.lower() == "true":
return True
elif spec.lower() == "false":
print("returning false")
return False
else:
# If a Tuple, expect a comma-delimited string?
Expand Down Expand Up @@ -521,7 +520,6 @@ def transform(
pool.join()
graphs = [r.get() for r in results]
else:
print("no transform config")
source_dict: Dict = {
"input": {
"format": input_format,
Expand Down Expand Up @@ -557,9 +555,7 @@ def transform(
+ "' are all rewrite specifications!"
)
else:
print("not a tuple")
source_dict["input"][ksf] = ksf_spec
print(source_dict)

name = os.path.basename(inputs[0])
transform_source(
Expand Down
2 changes: 1 addition & 1 deletion kgx/graph_operations/summarize_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def __init__(self, category_curie: str, summary):
# ...so that Category related entries at that
# higher level may be properly initialized
# for subsequent facet metadata access
if not self.category_curie == "unknown":
if self.category_curie != "unknown":
self.summary.node_stats[NODE_CATEGORIES].add(self.category_curie)
self.summary.node_stats[NODE_ID_PREFIXES_BY_CATEGORY][
self.category_curie
Expand Down
2 changes: 0 additions & 2 deletions kgx/sink/rdf_sink.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,6 @@ def uriref(self, identifier: str) -> URIRef:
# identifier is an entity
fixed_identifier = identifier
if fixed_identifier.startswith(":"):
# TODO: this should be handled upstream by prefixcommons-py
fixed_identifier = fixed_identifier.replace(":", "", 1)
if " " in identifier:
fixed_identifier = fixed_identifier.replace(" ", "_")
Expand Down Expand Up @@ -379,7 +378,6 @@ def _get_property_type(self, p: str) -> str:
The type for property name
"""
# TODO: this should be properly defined in the model
default_uri_types = {
"biolink:type",
"biolink:category",
Expand Down
3 changes: 2 additions & 1 deletion kgx/source/json_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import gzip
import typing
from typing import Optional, Generator, Any
import ijson
from itertools import chain
Expand Down Expand Up @@ -34,7 +35,7 @@ def parse(
format: str = "json",
compression: Optional[str] = None,
**kwargs: Any
) -> Generator:
) -> typing.Generator:
"""
This method reads from a JSON and yields records.
Expand Down
4 changes: 3 additions & 1 deletion kgx/source/jsonl_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import gzip
import re
import typing

import jsonlines
from typing import Optional, Any, Generator, Dict

Expand All @@ -25,7 +27,7 @@ def parse(
format: str = "jsonl",
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from JSON Lines and yields records.
Expand Down
5 changes: 2 additions & 3 deletions kgx/source/neo_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import typing
from typing import Any, Dict, List, Optional, Iterator, Tuple, Generator

from neo4j import GraphDatabase, Neo4jDriver
Expand Down Expand Up @@ -48,7 +49,7 @@ def parse(
is_directed: bool = True,
page_size: int = 50000,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from Neo4j instance and yields records
Expand Down Expand Up @@ -338,7 +339,6 @@ def load_node(self, node_data: Dict) -> Optional[Tuple]:
"""
self.node_count += 1
# TODO: remove the seen_nodes
self.seen_nodes.add(node_data["id"])

self.set_node_provenance(node_data)
Expand Down Expand Up @@ -458,7 +458,6 @@ def get_pages(
An iterator for a list of records from Neo4j. The size of the list is ``page_size``
"""
# TODO: use async
# itertools.count(0) starts counting from zero and would run indefinitely without a return statement.
# Unlike a plain while loop, it supplies the running index directly in the for statement.
for i in itertools.count(0):
Expand Down
5 changes: 2 additions & 3 deletions kgx/source/obograph_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import gzip
import typing
from itertools import chain
from typing import Optional, Tuple, Dict, Generator, Any
import ijson
Expand Down Expand Up @@ -33,7 +34,7 @@ def parse(
format: str = "json",
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from JSON and yields records.
Expand Down Expand Up @@ -182,7 +183,6 @@ def read_edge(self, edge: Dict) -> Optional[Tuple]:
if mapping:
element = self.toolkit.get_element(mapping)

# TODO: not sure how this exception would be thrown here.. under what conditions?
except ValueError as e:
self.owner.log_error(
entity=str(edge["pred"]),
Expand Down Expand Up @@ -254,7 +254,6 @@ def get_category(self, curie: str, node: dict) -> Optional[str]:

if not category or category == "biolink:OntologyClass":
prefix = PrefixManager.get_prefix(curie)
# TODO: the mapping should be via biolink-model lookups
if prefix == "HP":
category = "biolink:PhenotypicFeature"
elif prefix == "CHEBI":
Expand Down
3 changes: 2 additions & 1 deletion kgx/source/owl_source.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import typing
from typing import Set, Optional, Generator, Any

import rdflib
Expand Down Expand Up @@ -38,7 +39,7 @@ def parse(
format: str = "owl",
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from an OWL and yields records.
Expand Down
7 changes: 2 additions & 5 deletions kgx/source/rdf_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import gzip
import typing
from typing import Set, Dict, Union, Optional, Any, Tuple, List, Generator

import rdflib
Expand Down Expand Up @@ -43,9 +44,6 @@ class RdfSource(Source):
def __init__(self, owner):
super().__init__(owner)
self.DEFAULT = Namespace(self.prefix_manager.prefix_map[""])
# TODO: use OBO IRI from biolink model context once
# https://github.com/biolink/biolink-model/issues/211 is resolved
# self.OBO = Namespace('http://purl.obolibrary.org/obo/')
self.OBAN = Namespace(self.prefix_manager.prefix_map["OBAN"])
self.PMID = Namespace(self.prefix_manager.prefix_map["PMID"])
self.BIOLINK = Namespace(self.prefix_manager.prefix_map["biolink"])
Expand All @@ -65,7 +63,6 @@ def __init__(self, owner):
set(self.toolkit.get_all_edge_properties(formatted=True))
)

# TODO: validate expansion of the scope of this statement to include 'knowledge_source' and its descendants?
for ksf in knowledge_provenance_properties:
self.node_property_predicates.add(
URIRef(self.prefix_manager.expand("biolink:" + ksf))
Expand Down Expand Up @@ -133,7 +130,7 @@ def parse(
format: str = "nt",
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from RDF N-Triples and yields records.
Expand Down
3 changes: 2 additions & 1 deletion kgx/source/sssom_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
import gzip
import re
import typing

import pandas as pd
from typing import Optional, Generator, Any, Dict, Tuple
Expand Down Expand Up @@ -70,7 +71,7 @@ def parse(
format: str,
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
Parse a SSSOM TSV
Expand Down
7 changes: 3 additions & 4 deletions kgx/source/trapi_source.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import gzip
import typing

import ijson
from itertools import chain
from typing import Dict, Tuple, Generator, Optional, Any

from kgx.source.json_source import JsonSource


# TODO: update for TRAPI 1.0 spec


class TrapiSource(JsonSource):
"""
TrapiSource is responsible for reading data as records
Expand All @@ -26,7 +25,7 @@ def parse(
format: str = "json",
compression: Optional[str] = None,
**kwargs: Any
) -> Generator:
) -> typing.Generator:
"""
This method reads from a JSON and yields records.
Expand Down
7 changes: 2 additions & 5 deletions kgx/source/tsv_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import tarfile
import typing
from typing import Dict, Tuple, Any, Generator, Optional, List
import pandas as pd

Expand Down Expand Up @@ -58,7 +59,7 @@ def parse(
format: str,
compression: Optional[str] = None,
**kwargs: Any,
) -> Generator:
) -> typing.Generator:
"""
This method reads from a TSV/CSV and yields records.
Expand Down Expand Up @@ -128,9 +129,6 @@ def parse(
continue

f = tar.extractfile(member)
# TODO: can this somehow be streamed here?
# Question: who put the above comment here? One wonders whether the use of the chunk-based
# file_iter, with the Generator yield statement below, isn't effectively streaming the file?
file_iter = pd.read_csv(
f,
dtype=str,
Expand All @@ -154,7 +152,6 @@ def parse(
continue

f = tar.extractfile(member)
# TODO: can this somehow be streamed here?
file_iter = pd.read_csv(
f,
dtype=str,
Expand Down
2 changes: 0 additions & 2 deletions kgx/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,6 @@ def transform(
if ksf in input_args:
ks_args[ksf] = input_args[ksf]

# TODO: does this call also need the default_provenance named argument?
intermediate_source_generator = intermediate_source.parse(
intermediate_sink.graph, **ks_args
)
Expand Down Expand Up @@ -362,7 +361,6 @@ def process(self, source: Generator, sink: Sink) -> None:
# last element of rec is the node properties
sink.write_node(rec[-1])

# TODO: review whether or not the 'save()' method need to be 'knowledge_source' aware?
def save(self, output_args: Dict) -> None:
"""
Save data from the in-memory store to a desired sink.
Expand Down
6 changes: 0 additions & 6 deletions kgx/utils/infores.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ def _process_infores(source: str) -> str:
infores = re.sub(r"[\W]", "", infores)
infores = re.sub(r"_", "-", infores)

# TODO: to be fully compliant, the InfoRes needs to have the 'infores' prefix?
infores = "infores:" + infores

return infores
Expand Down Expand Up @@ -336,7 +335,6 @@ def set_provenance_map(self, kwargs: Dict):
"""
if "default_provenance" in kwargs:
self.default_provenance = kwargs.pop("default_provenance")
print(self.default_provenance)

ksf_found = []
for ksf in knowledge_provenance_properties:
Expand All @@ -353,9 +351,7 @@ def set_provenance_map(self, kwargs: Dict):
)
else:
ir = self.get_mapping(ksf)
print(ir)
self.mapping[ksf] = ir.set_provenance_map_entry(ksf_value)
print("mapping", self.mapping)

# if none specified, add at least one generic 'knowledge_source'
if not ksf_found:
Expand Down Expand Up @@ -384,7 +380,6 @@ def set_provenance(self, ksf: str, data: Dict):
"""

if ksf not in data.keys():
print(data.keys())
if ksf in self.mapping and not isinstance(self.mapping[ksf], dict):
data[ksf] = self.mapping[ksf]()
else:
Expand All @@ -394,7 +389,6 @@ def set_provenance(self, ksf: str, data: Dict):
else:
# If data is a non-string iterable, then coerce it into a simple list of sources
if isinstance(data[ksf], (list, set, tuple)):
print("is a tuple???")
sources = list(data[ksf])
else:
# wraps knowledge sources that are multivalued in a list even if single valued
Expand Down
5 changes: 0 additions & 5 deletions kgx/utils/kgx_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ def expand(

_default_toolkit = None

# TODO: not sure how threadsafe this simple-minded Toolkit cache is
_toolkit_versions: Dict[str, Toolkit] = dict()


Expand Down Expand Up @@ -418,7 +417,6 @@ def get_prefix_prioritization_map() -> Dict[str, List]:
"""
toolkit = get_toolkit()
prefix_prioritization_map = {}
# TODO: Lookup via Biolink CURIE should be supported in bmt
descendants = toolkit.get_descendants("named thing")
descendants.append("named thing")
for d in descendants:
Expand Down Expand Up @@ -493,7 +491,6 @@ def get_biolink_property_types() -> Dict:
property_type = get_type_for_property(p)
types[p] = property_type

# TODO: this should be moved to biolink model
types["biolink:predicate"] = "uriorcurie"
types["biolink:edge_label"] = "uriorcurie"
return types
Expand All @@ -503,8 +500,6 @@ def get_type_for_property(p: str) -> str:
"""
Get type for a property.
TODO: Move this to biolink-model-default_toolkit
Parameters
----------
p: str
Expand Down
1 change: 0 additions & 1 deletion kgx/utils/rdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
property_mapping: Dict = dict()
reverse_property_mapping: Dict = dict()

# TODO: this should be populated via bmt
is_property_multivalued = {
"id": False,
"subject": False,
Expand Down
6 changes: 1 addition & 5 deletions kgx/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,7 @@ def get_the_validator(
Creates and manages a default singleton Validator in the module, when called
"""
if not cls._the_validator:
# TODO: This may need to be adjusted periodically,
# but for now, we reset the model to a recent version
cls.set_biolink_model("2.2.11")
cls.set_biolink_model("3.0.0")
cls._the_validator = Validator(
verbose=verbose,
progress_monitor=progress_monitor,
Expand All @@ -92,7 +90,6 @@ def __init__(
Callable[[GraphEntityType, List], None]
] = progress_monitor

# TODO: fix... this attribute is not used anywhere at the moment?
self.schema: Optional[str] = schema

# internal attributes
Expand Down Expand Up @@ -274,7 +271,6 @@ def get_required_edge_properties(toolkit: Optional[Toolkit] = None) -> list:
def validate(self, graph: BaseGraph):
"""
Validate nodes and edges in a graph.
TODO: Support strict mode
Parameters
----------
Expand Down
Loading

0 comments on commit a981a3e

Please sign in to comment.