Skip to content

Commit

Permalink
Add ability to generate summary report in different formats via KGX CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
deepakunni3 committed Feb 23, 2021
1 parent d5bac10 commit 34db46f
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 12 deletions.
9 changes: 6 additions & 3 deletions kgx/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from kgx.config import get_logger, get_config
from kgx.cli.cli_utils import get_file_types, get_transformer, parse_source, apply_operations, graph_summary, validate, \
neo4j_download, neo4j_upload, transform, merge
neo4j_download, neo4j_upload, transform, merge, summary_report_types

log = get_logger()
config = get_config()
Expand All @@ -31,9 +31,10 @@ def cli():
@click.option('--input-format', required=True, help=f'The input format. Can be one of {get_file_types()}')
@click.option('--input-compression', required=False, help='The input compression type')
@click.option('--output', required=True, type=click.Path(exists=False))
@click.option('--report-type', required=False, type=str, help=f'The summary report type. Can be one of {summary_report_types.keys()}', default='kgx-map')
@click.option('--node-facet-properties', required=False, multiple=True, help='A list of node properties from which to generate counts per value for those properties')
@click.option('--edge-facet-properties', required=False, multiple=True, help='A list of edge properties from which to generate counts per value for those properties')
def graph_summary_wrapper(inputs: List[str], input_format: str, input_compression: str, output: str, node_facet_properties: Optional[Set], edge_facet_properties: Optional[Set]):
def graph_summary_wrapper(inputs: List[str], input_format: str, input_compression: str, output: str, report_type: str, node_facet_properties: Optional[Set], edge_facet_properties: Optional[Set]):
"""
Loads and summarizes a knowledge graph from a set of input files.
\f
Expand All @@ -48,12 +49,14 @@ def graph_summary_wrapper(inputs: List[str], input_format: str, input_compressio
The input compression type
output: str
Where to write the output (stdout, by default)
report_type: str
The summary report type
node_facet_properties: Optional[Set]
A list of node properties from which to generate counts per value for those properties. For example, ``['provided_by']``
edge_facet_properties: Optional[Set]
A list of edge properties from which to generate counts per value for those properties. For example, ``['provided_by']``
"""
graph_summary(inputs, input_format, input_compression, output, node_facet_properties=list(node_facet_properties), edge_facet_properties=list(edge_facet_properties))
graph_summary(inputs, input_format, input_compression, output, report_type, node_facet_properties=list(node_facet_properties), edge_facet_properties=list(edge_facet_properties))


@cli.command('validate')
Expand Down
18 changes: 15 additions & 3 deletions kgx/cli/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
from typing import List, Tuple, Any, Optional, Dict, Set

import yaml

from kgx.operations import knowledge_map
from kgx.transformers.sssom_transformer import SssomTransformer

from kgx import PandasTransformer, NeoTransformer, Validator, RdfTransformer, NtTransformer, RsaTransformer, \
RdfOwlTransformer, ObographJsonTransformer, JsonlTransformer, JsonTransformer, Transformer
from kgx.config import get_logger
from kgx.graph.base_graph import BaseGraph
from kgx.operations.graph_merge import merge_all_graphs
from kgx.operations.summarize_graph import summarize_graph
from kgx.operations import summarize_graph

_transformers = {
'tar': PandasTransformer,
Expand All @@ -30,6 +32,11 @@
'sssom': SssomTransformer
}

# Maps each supported summary report type name to the function that
# generates that report; graph_summary looks up its report_type here.
summary_report_types = {
'kgx-map': summarize_graph.summarize_graph,
'knowledge-map': knowledge_map.summarize_graph
}

log = get_logger()


Expand Down Expand Up @@ -72,7 +79,7 @@ def get_file_types() -> Tuple:
return tuple(_transformers.keys())


def graph_summary(inputs: List[str], input_format: str, input_compression: Optional[str], output: Optional[str], node_facet_properties: Optional[List] = None, edge_facet_properties: Optional[List] = None) -> Dict:
def graph_summary(inputs: List[str], input_format: str, input_compression: Optional[str], output: Optional[str], report_type: str, node_facet_properties: Optional[List] = None, edge_facet_properties: Optional[List] = None) -> Dict:
"""
Loads and summarizes a knowledge graph from a set of input files.
Expand All @@ -86,6 +93,8 @@ def graph_summary(inputs: List[str], input_format: str, input_compression: Optio
The input compression type
output: Optional[str]
Where to write the output (stdout, by default)
report_type: str
The summary report type
node_facet_properties: Optional[List]
A list of node properties from which to generate counts per value for those properties. For example, ``['provided_by']``
edge_facet_properties: Optional[List]
Expand All @@ -101,7 +110,10 @@ def graph_summary(inputs: List[str], input_format: str, input_compression: Optio
for file in inputs:
transformer.parse(file, input_format=input_format, compression=input_compression)

stats = summarize_graph(transformer.graph, name='Graph', node_facet_properties=node_facet_properties, edge_facet_properties=edge_facet_properties)
if report_type in summary_report_types:
stats = summary_report_types[report_type](graph=transformer.graph, name='Graph', node_facet_properties=node_facet_properties, edge_facet_properties=edge_facet_properties)
else:
raise ValueError(f"report_type must be one of {summary_report_types.keys()}")
if output:
WH = open(output, 'w')
WH.write(yaml.dump(stats))
Expand Down
6 changes: 3 additions & 3 deletions kgx/operations/knowledge_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def generate_knowledge_map(graph: BaseGraph, name: str, filename: str) -> None:
json.dump(knowledge_map, WH, indent=4)


def summarize_graph(graph: BaseGraph, name: str = None) -> Dict:
def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict:
"""
Generate a knowledge map that describes the composition of the graph.
Expand All @@ -41,8 +41,8 @@ def summarize_graph(graph: BaseGraph, name: str = None) -> Dict:
The graph
name: Optional[str]
Name for the graph
filename: str
The file to write the knowledge map to
kwargs: Dict
Any additional arguments
Returns
-------
Expand Down
22 changes: 19 additions & 3 deletions tests/unit/test_cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ def test_get_file_types():
assert 'ttl' in file_types


def test_graph_summary():
def test_graph_summary1():
inputs = [
os.path.join(resource_dir, 'graph_nodes.tsv'),
os.path.join(resource_dir, 'graph_edges.tsv')
]
output = os.path.join(target_dir, 'graph_stats.yaml')
summary_stats = graph_summary(inputs, 'tsv', None, output)
output = os.path.join(target_dir, 'graph_stats1.yaml')
summary_stats = graph_summary(inputs, 'tsv', None, output, report_type='kgx-map')
pprint.pprint(summary_stats)

assert os.path.exists(output)
Expand All @@ -58,6 +58,22 @@ def test_graph_summary():
assert 'biolink:interacts_with' in summary_stats['edge_stats']['predicates']


def test_graph_summary2():
    """
    Run ``graph_summary`` with the 'knowledge-map' report type on the
    TSV fixtures and check the written report and its top-level keys.
    """
    source_files = [
        os.path.join(resource_dir, filename)
        for filename in ('graph_nodes.tsv', 'graph_edges.tsv')
    ]
    report_path = os.path.join(target_dir, 'graph_stats2.yaml')
    summary_stats = graph_summary(source_files, 'tsv', None, report_path, report_type='knowledge-map')
    pprint.pprint(summary_stats)

    # The report file must exist and the returned stats must be truthy.
    assert os.path.exists(report_path)
    assert summary_stats
    assert 'knowledge_map' in summary_stats
    knowledge_map_section = summary_stats['knowledge_map']
    for section in ('nodes', 'edges'):
        assert section in knowledge_map_section


def test_validate():
inputs = [
os.path.join(resource_dir, 'valid.json'),
Expand Down

0 comments on commit 34db46f

Please sign in to comment.