From c2e7b3de14020d30cf67a2f4585327f32f8ba4c9 Mon Sep 17 00:00:00 2001 From: Martin Coll Date: Mon, 26 Aug 2024 17:04:43 -0300 Subject: [PATCH 1/4] single node and bad node --- tests/conftest.py | 8 ++++++++ tests/test_files/bad_node_without_id.json | 1 + tests/test_files/single_node.json | 1 + tests/test_schema.py | 9 +++++++++ 4 files changed, 19 insertions(+) create mode 100644 tests/test_files/bad_node_without_id.json create mode 100644 tests/test_files/single_node.json diff --git a/tests/conftest.py b/tests/conftest.py index 44e6ab5..ec121da 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,6 +16,9 @@ def validator(): def empty(): return json.load(open(f"{json_dir}/empty.json", "r")) +@pytest.fixture +def single_node(): + return json.load(open(f"{json_dir}/single_node.json", "r")) @pytest.fixture def bad_top_level_field(): @@ -27,6 +30,11 @@ def bad_network_type(): return json.load(open(f"{json_dir}/bad_network_type.json", "r")) +@pytest.fixture +def bad_node_without_id(): + return json.load(open(f"{json_dir}/bad_node_without_id.json", "r")) + + @pytest.fixture def metadata_as_list(): return json.load(open(f"{json_dir}/metadata_as_list.json", "r")) diff --git a/tests/test_files/bad_node_without_id.json b/tests/test_files/bad_node_without_id.json new file mode 100644 index 0000000..05ec473 --- /dev/null +++ b/tests/test_files/bad_node_without_id.json @@ -0,0 +1 @@ +{"incidences": [], "nodes": [ { } ]} \ No newline at end of file diff --git a/tests/test_files/single_node.json b/tests/test_files/single_node.json new file mode 100644 index 0000000..e9f08e4 --- /dev/null +++ b/tests/test_files/single_node.json @@ -0,0 +1 @@ +{"incidences": [], "nodes": [ { "node": 42 } ]} \ No newline at end of file diff --git a/tests/test_schema.py b/tests/test_schema.py index fb9f219..9fcb436 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -16,6 +16,15 @@ def test_bad_network_type(validator, bad_network_type): validator(bad_network_type) +def test_bad_node_without_id(validator, bad_node_without_id): + with pytest.raises(ValueError): + validator(bad_node_without_id) + + +def test_single_node(validator, single_node): + validator(single_node) + + def test_metadata_as_list(validator, metadata_as_list): with pytest.raises(ValueError): validator(metadata_as_list) From 39f68c78ab910597cc6dd2de825f24704b221640 Mon Sep 17 00:00:00 2001 From: Martin Coll Date: Mon, 26 Aug 2024 17:06:38 -0300 Subject: [PATCH 2/4] single edge --- tests/conftest.py | 4 ++++ tests/test_files/single_edge.json | 1 + tests/test_schema.py | 4 ++++ 3 files changed, 9 insertions(+) create mode 100644 tests/test_files/single_edge.json diff --git a/tests/conftest.py b/tests/conftest.py index ec121da..16a3049 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,6 +20,10 @@ def empty(): def single_node(): return json.load(open(f"{json_dir}/single_node.json", "r")) +@pytest.fixture +def single_edge(): + return json.load(open(f"{json_dir}/single_edge.json", "r")) + @pytest.fixture def bad_top_level_field(): return json.load(open(f"{json_dir}/bad_top_level_field.json", "r")) diff --git a/tests/test_files/single_edge.json b/tests/test_files/single_edge.json new file mode 100644 index 0000000..770692e --- /dev/null +++ b/tests/test_files/single_edge.json @@ -0,0 +1 @@ +{"incidences": [], "edges": [ { "edge": 3 } ]} \ No newline at end of file diff --git a/tests/test_schema.py b/tests/test_schema.py index 9fcb436..b3550ba 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -25,6 +25,10 @@ def test_single_node(validator, single_node): validator(single_node) +def test_single_edge(validator, single_edge): + validator(single_edge) + + def test_metadata_as_list(validator, metadata_as_list): with pytest.raises(ValueError): validator(metadata_as_list) From 7602292f5c9abd640d8c719bb424fde0e5a580e0 Mon Sep 17 00:00:00 2001 From: Martin Coll Date: Mon, 26 Aug 2024 17:15:47 -0300 Subject: [PATCH 3/4] refactor --- scripts/hif.py | 125 ++++------------------- tests/conftest.py | 4 + tests/test_files/single_incidence.json | 1 + tests/test_schema.py | 4 + validate_hif.py | 134 +------------------------ 5 files changed, 31 insertions(+), 237 deletions(-) create mode 100644 tests/test_files/single_incidence.json diff --git a/scripts/hif.py b/scripts/hif.py index 3aeded4..e91532a 100644 --- a/scripts/hif.py +++ b/scripts/hif.py @@ -1,112 +1,25 @@ -import json from collections import defaultdict -from warnings import warn -def validate_hif(path): - with open(path) as file: - # load JSON file - data = json.loads(file.read()) - - # dictionary to store statuses - info = {} - - # check that keys do not deviate from the standard field names - info["valid-field-names"] = 0 - fields = {"network-type", "metadata", "nodes", "edges", "incidences"} - if not set(data).issubset(fields): - warn( - f"Acceptable field names are: {", ".join(fields)}\nand the field names are {", ".join(set(data))}" - ) - info["valid-field-names"] = 1 - - # incidences are required; check that they exist - info["incidences-exist"] = 0 - - if "incidences" not in data: - warn(f"The file must contain an field for incidences.") - info["incidences-exist"] = 1 - - # check network type - info["valid-network-type"] = 0 - - network_types = {"asc", "undirected", "directed"} - if "network-type" in data: - if data["network-type"] not in network_types: - warn( - f"Unsupported network type. Valid types are: {", ".join(network_types)}" - ) - info["valid-network-type"] = 1 - - # check network metadata - info["metadata-dict"] = 0 - - if "metadata" in data: - if not isinstance(data["metadata"], dict): - warn(f"The metadata must be dict-like.") - info["metadata-dict"] = 1 - - # check node attributes - info["node-record-length"] = 0 - info["node-attr-dict"] = 0 - - if "nodes" in data: - for i, record in enumerate(data["nodes"]): - if len(record) != 2: - warn( - f"Each node record must have two entries: an ID and the dictionary of corresponding attributes." - ) - info["node-record-length"] = 1 - - if not isinstance(record[1], dict): - warn(f"The node attributes must be dict-like.") - info["node-attr-dict"] = 1 - - # check edge attributes - info["edge-record-length"] = 0 - info["edge-attr-dict"] = 0 - - if "edges" in data: - for i, record in enumerate(data["edges"]): - if len(record) != 2: - warn( - f"Each edge record must have two entries: an ID and the dictionary of corresponding attributes." - ) - info["edge-record-length"] = 1 - - if not isinstance(record[1], dict): - warn(f"The edge attributes must be dict-like.") - info["edge-attr-dict"] = 1 - - if "incidences" in data: - info["incidence-record-length"] = 0 - info["incidence-attr-dict"] = 0 - +def validate_network_type(data, verbose): + """ + Custom validations for network types + """ + if ( + "network-type" in data + and data["network-type"] == "directed" + and "incidences" in data + ): for i, record in enumerate(data["incidences"]): - if len(record) != 3: - warn( - f"Each incidence record must have three entries: an edge ID, a node ID, and the dictionary of corresponding attributes." - ) - info["incidence-record-length"] = 1 - - if not isinstance(record[2], dict): - warn(f"The incidence attributes must be dict-like.") - info["incidence-attr-dict"] = 1 - - # in the case of directed hypergraphs, each incidence must - # have the "direction" attribute - if "network-type" in data and data["network-type"] == "directed": - data["direction-exists-for-directed"] = 0 - for i, record in enumerate(data["edges"]): if "direction" not in record[2]: - warn( - f"Each incidence record must have have the 'direction' attribute for directed hypergraphs." - ) - data["direction-exists-for-directed"] = 1 + status = 1 + if verbose: + print( + f"Each incidence record must have have the 'direction' attribute for directed hypergraphs." + ) # in the case of simplicial complexes, make sure that the edges are maximal - if "network-type" in data and data["network-type"] == "asc": - data["maximal-edges-for-asc"] = 0 + if "network-type" in data and data["network-type"] == "asc" and "incidences" in data: edgedict = defaultdict(set) for record in data["incidences"]: e = record[0] @@ -115,9 +28,7 @@ def validate_hif(path): for e1, edge1 in edgedict.items(): for e2, edge2 in edgedict.items(): if e1 != e2 and edge1.issubset(edge2): - warn( - f"Only maximal faces should be stored for simplicial complexes." + if verbose: + print( + f"Only maximal faces should be stored for simplicial complexes." ) - data["maximal-edges-for-asc"] = 1 - - return info diff --git a/tests/conftest.py b/tests/conftest.py index 16a3049..b161a8b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -24,6 +24,10 @@ def single_node(): def single_edge(): return json.load(open(f"{json_dir}/single_edge.json", "r")) +@pytest.fixture +def single_incidence(): + return json.load(open(f"{json_dir}/single_incidence.json", "r")) + @pytest.fixture def bad_top_level_field(): return json.load(open(f"{json_dir}/bad_top_level_field.json", "r")) diff --git a/tests/test_files/single_incidence.json b/tests/test_files/single_incidence.json new file mode 100644 index 0000000..30f703a --- /dev/null +++ b/tests/test_files/single_incidence.json @@ -0,0 +1 @@ +{"incidences": [ { "edge": "abcd", "node": 42 } ]} \ No newline at end of file diff --git a/tests/test_schema.py b/tests/test_schema.py index b3550ba..dda7945 100644 --- a/tests/test_schema.py +++ b/tests/test_schema.py @@ -29,6 +29,10 @@ def test_single_edge(validator, single_edge): validator(single_edge) +def test_single_incidence(validator, single_incidence): + validator(single_incidence) + + def test_metadata_as_list(validator, metadata_as_list): with pytest.raises(ValueError): validator(metadata_as_list) diff --git a/validate_hif.py b/validate_hif.py index 82fcd9f..4dc3a08 100644 --- a/validate_hif.py +++ b/validate_hif.py @@ -1,11 +1,9 @@ import json import sys -from collections import defaultdict import fastjsonschema -# 0 - OK, 1 - bad JSON -status = 0 +from scripts.hif import validate_network_type if len(sys.argv) > 2 and sys.argv[2] == "--silent": @@ -14,136 +12,12 @@ verbose = True # network parameters -filename = "lesmis-hif.json" -schema_filename = "hif_schema.json" - +filename = sys.argv[1] +schema_filename = "schemas/hif_schema_v0.1.0.json" with open(filename) as file, open(schema_filename) as schema_file: # load JSON file validate_schema = fastjsonschema.compile(json.load(schema_file)) data = json.load(file) validate_schema(data) - -# check that keys do not deviate from the standard field names -# DESCRIPTIONS OF THE FIELDS -# "network-type": a string indicating what type of network the dataset is. -# Valid choices currently include: -# - "undirected" (undirected hypergraph with potential multiedges) -# - "asc" (simplicial complex where only the maximal faces are stored) -# - "directed" (directed hypergraph with potential multiedges) -# "metadata": any dataset-level attributes (e.g., name, author, etc.) which must be dict-like -# "nodes": a list of 2-entries, where the first entry of a record is the node ID -# and the second entry is dict-like and stores the associated attributes. -# "edges": a list of 2-entries, where the first entry of a record is the edge ID -# and the second entry is dict-like and stores the associated attributes. -# "incidences": a list of 3-entries, where the first entry of a record is the edge ID, -# the second entry is the node ID, and the third entry is dict-like and -# stores the associated attributes. -# **Note that this is the only required field. - -fields = {"network-type", "metadata", "nodes", "edges", "incidences"} -if not set(data).issubset(fields): - status = 1 - if verbose: - field_names = ", ".join(fields) - new_field_names = ", ".join(set(data)) - print( - f"Acceptable field names are: {field_names}\nand the field names are {new_field_names}" - ) - -# incidences are required -if "incidences" not in data: - status = 1 - if verbose: - print(f"The file must contain an field for incidences.") - -# check network type -network_types = {"asc", "undirected", "directed"} -if "network-type" in data: - if data["network-type"] not in network_types: - status = 1 - - if verbose: - network_types = ", ".join(network_types) - print(f"Unsupported network type. Valid types are: {network_types}") - -# check network metadata -if "metadata" in data: - if not isinstance(data["metadata"], dict): - status = 1 - if verbose: - print(f"The metadata must be dict-like.") - -# check node attributes -if "nodes" in data: - for i, record in enumerate(data["nodes"]): - if len(record) != 2: - status = 1 - if verbose: - print( - f"Each node record must have two entries: an ID and the dictionary of corresponding attributes." - ) - if not isinstance(record[1], dict): - status = 1 - if verbose: - print(f"The node attributes must be dict-like.") - -# check edge attributes -if "edges" in data: - for i, record in enumerate(data["edges"]): - if len(record) != 2: - status = 1 - if verbose: - print( - f"Each edge record must have two entries: an ID and the dictionary of corresponding attributes." - ) - if not isinstance(record[1], dict): - status = 1 - if verbose: - print(f"The edge attributes must be dict-like.") - -if "incidences" in data: - for i, record in enumerate(data["incidences"]): - if len(record) != 3: - status = 1 - if verbose: - print( - f"Each incidence record must have three entries: an edge ID, a node ID, and the dictionary of corresponding attributes." - ) - if not isinstance(record[2], dict): - status = 1 - if verbose: - print(f"The incidence attributes must be dict-like.") - -# in the case of directed hypergraphs, each incidence must -# have the "direction" attribute -if ( - "network-type" in data - and data["network-type"] == "directed" - and "incidences" in data -): - for i, record in enumerate(data["incidences"]): - if "direction" not in record[2]: - status = 1 - if verbose: - print( - f"Each incidence record must have have the 'direction' attribute for directed hypergraphs." - ) - -# in the case of simplicial complexes, make sure that the edges are maximal -if "network-type" in data and data["network-type"] == "asc" and "incidences" in data: - edgedict = defaultdict(set) - for record in data["incidences"]: - e = record[0] - n = record[1] - edgedict[e].add(n) - for e1, edge1 in edgedict.items(): - for e2, edge2 in edgedict.items(): - if e1 != e2 and edge1.issubset(edge2): - status = 1 - if verbose: - print( - f"Only maximal faces should be stored for simplicial complexes." - ) - -print(f"Exit status {status}.") + validate_network_type(data, verbose) From a95ebd3c1f2309fcafe47f274a6ade0dcfc4b0ae Mon Sep 17 00:00:00 2001 From: Martin Coll Date: Tue, 27 Aug 2024 13:08:15 -0300 Subject: [PATCH 4/4] missing dep --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 1d875ac..d0db9c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ fastjsonschema +networkx pytest requests \ No newline at end of file