diff --git a/kgx/cli/__init__.py b/kgx/cli/__init__.py index 9c326cf7..aa7e5d91 100644 --- a/kgx/cli/__init__.py +++ b/kgx/cli/__init__.py @@ -4,7 +4,7 @@ from kgx.config import get_logger, get_config from kgx.cli.cli_utils import get_file_types, get_transformer, parse_source, apply_operations, graph_summary, validate, \ - neo4j_download, neo4j_upload, transform, merge + neo4j_download, neo4j_upload, transform, merge, summary_report_types log = get_logger() config = get_config() @@ -31,9 +31,10 @@ def cli(): @click.option('--input-format', required=True, help=f'The input format. Can be one of {get_file_types()}') @click.option('--input-compression', required=False, help='The input compression type') @click.option('--output', required=True, type=click.Path(exists=False)) +@click.option('--report-type', required=False, type=str, help=f'The summary report type. Can be one of {summary_report_types.keys()}', default='kgx-map') @click.option('--node-facet-properties', required=False, multiple=True, help='A list of node properties from which to generate counts per value for those properties') @click.option('--edge-facet-properties', required=False, multiple=True, help='A list of edge properties from which to generate counts per value for those properties') -def graph_summary_wrapper(inputs: List[str], input_format: str, input_compression: str, output: str, node_facet_properties: Optional[Set], edge_facet_properties: Optional[Set]): +def graph_summary_wrapper(inputs: List[str], input_format: str, input_compression: str, output: str, report_type: str, node_facet_properties: Optional[Set], edge_facet_properties: Optional[Set]): """ Loads and summarizes a knowledge graph from a set of input files. \f @@ -48,12 +49,14 @@ def graph_summary_wrapper(inputs: List[str], input_format: str, input_compressio The input compression type output: str Where to write the output (stdout, by default) + report_type: str + The summary report type node_facet_properties: Optional[Set] A list of node properties from which to generate counts per value for those properties. For example, ``['provided_by']`` edge_facet_properties: Optional[Set] A list of edge properties from which to generate counts per value for those properties. For example, ``['provided_by']`` """ - graph_summary(inputs, input_format, input_compression, output, node_facet_properties=list(node_facet_properties), edge_facet_properties=list(edge_facet_properties)) + graph_summary(inputs, input_format, input_compression, output, report_type, node_facet_properties=list(node_facet_properties), edge_facet_properties=list(edge_facet_properties)) @cli.command('validate') diff --git a/kgx/cli/cli_utils.py b/kgx/cli/cli_utils.py index d3ddf158..779255ae 100644 --- a/kgx/cli/cli_utils.py +++ b/kgx/cli/cli_utils.py @@ -5,6 +5,8 @@ from typing import List, Tuple, Any, Optional, Dict, Set import yaml + +from kgx.operations import knowledge_map from kgx.transformers.sssom_transformer import SssomTransformer from kgx import PandasTransformer, NeoTransformer, Validator, RdfTransformer, NtTransformer, RsaTransformer, \ @@ -12,7 +14,7 @@ from kgx.config import get_logger from kgx.graph.base_graph import BaseGraph from kgx.operations.graph_merge import merge_all_graphs -from kgx.operations.summarize_graph import summarize_graph +from kgx.operations import summarize_graph _transformers = { 'tar': PandasTransformer, @@ -30,6 +32,11 @@ 'sssom': SssomTransformer } +summary_report_types = { + 'kgx-map': summarize_graph.summarize_graph, + 'knowledge-map': knowledge_map.summarize_graph +} + log = get_logger() @@ -72,7 +79,7 @@ def get_file_types() -> Tuple: return tuple(_transformers.keys()) -def graph_summary(inputs: List[str], input_format: str, input_compression: Optional[str], output: Optional[str], node_facet_properties: Optional[List] = None, edge_facet_properties: Optional[List] = None) -> Dict: +def graph_summary(inputs: List[str], input_format: str, input_compression: Optional[str], output: Optional[str], report_type: str, node_facet_properties: Optional[List] = None, edge_facet_properties: Optional[List] = None) -> Dict: """ Loads and summarizes a knowledge graph from a set of input files. @@ -86,6 +93,8 @@ def graph_summary(inputs: List[str], input_format: str, input_compression: Optio The input compression type output: Optional[str] Where to write the output (stdout, by default) + report_type: str + The summary report type node_facet_properties: Optional[List] A list of node properties from which to generate counts per value for those properties. For example, ``['provided_by']`` edge_facet_properties: Optional[List] @@ -101,7 +110,10 @@ def graph_summary(inputs: List[str], input_format: str, input_compression: Optio for file in inputs: transformer.parse(file, input_format=input_format, compression=input_compression) - stats = summarize_graph(transformer.graph, name='Graph', node_facet_properties=node_facet_properties, edge_facet_properties=edge_facet_properties) + if report_type in summary_report_types: + stats = summary_report_types[report_type](graph=transformer.graph, name='Graph', node_facet_properties=node_facet_properties, edge_facet_properties=edge_facet_properties) + else: + raise ValueError(f"report_type must be one of {summary_report_types.keys()}") if output: WH = open(output, 'w') WH.write(yaml.dump(stats)) diff --git a/kgx/operations/knowledge_map.py b/kgx/operations/knowledge_map.py index 8e752eb6..3ebde3fd 100644 --- a/kgx/operations/knowledge_map.py +++ b/kgx/operations/knowledge_map.py @@ -31,7 +31,7 @@ def generate_knowledge_map(graph: BaseGraph, name: str, filename: str) -> None: json.dump(knowledge_map, WH, indent=4) -def summarize_graph(graph: BaseGraph, name: str = None) -> Dict: +def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict: """ Generate a knowlege map that describes the composition of the graph. @@ -41,8 +41,8 @@ def summarize_graph(graph: BaseGraph, name: str = None) -> Dict: The graph name: Optional[str] Name for the graph - filename: str - The file to write the knowledge map to + kwargs: Dict + Any additional arguments Returns ------- diff --git a/tests/unit/test_cli_utils.py b/tests/unit/test_cli_utils.py index b1d7bc7d..7c395271 100644 --- a/tests/unit/test_cli_utils.py +++ b/tests/unit/test_cli_utils.py @@ -37,13 +37,13 @@ def test_get_file_types(): assert 'ttl' in file_types -def test_graph_summary(): +def test_graph_summary1(): inputs = [ os.path.join(resource_dir, 'graph_nodes.tsv'), os.path.join(resource_dir, 'graph_edges.tsv') ] - output = os.path.join(target_dir, 'graph_stats.yaml') - summary_stats = graph_summary(inputs, 'tsv', None, output) + output = os.path.join(target_dir, 'graph_stats1.yaml') + summary_stats = graph_summary(inputs, 'tsv', None, output, report_type='kgx-map') pprint.pprint(summary_stats) assert os.path.exists(output) @@ -58,6 +58,22 @@ def test_graph_summary(): assert 'biolink:interacts_with' in summary_stats['edge_stats']['predicates'] +def test_graph_summary2(): + inputs = [ + os.path.join(resource_dir, 'graph_nodes.tsv'), + os.path.join(resource_dir, 'graph_edges.tsv') + ] + output = os.path.join(target_dir, 'graph_stats2.yaml') + summary_stats = graph_summary(inputs, 'tsv', None, output, report_type='knowledge-map') + pprint.pprint(summary_stats) + + assert os.path.exists(output) + assert summary_stats + assert 'knowledge_map' in summary_stats + assert 'nodes' in summary_stats['knowledge_map'] + assert 'edges' in summary_stats['knowledge_map'] + + def test_validate(): inputs = [ os.path.join(resource_dir, 'valid.json'),