From d1a381aa800cfef52087827a1b1414f4e6019765 Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 5 May 2023 12:36:19 -0400 Subject: [PATCH] Assorted minor code improvements --- src/nidm/core/Constants.py | 16 +- src/nidm/core/provone.py | 4 +- src/nidm/core/serializers/__init__.py | 2 - src/nidm/core/serializers/provonerdf.py | 28 +- src/nidm/experiment/Acquisition.py | 5 +- src/nidm/experiment/Core.py | 73 +- src/nidm/experiment/Derivative.py | 5 +- src/nidm/experiment/Navigate.py | 36 +- src/nidm/experiment/Query.py | 18 +- src/nidm/experiment/Utils.py | 1025 ++++++++--------- src/nidm/experiment/tools/bidsmri2nidm.py | 17 +- src/nidm/experiment/tools/csv2nidm.py | 21 +- src/nidm/experiment/tools/nidm2bids.py | 43 +- .../tools/nidm_affinity_propagation.py | 62 +- .../tools/nidm_agglomerative_clustering.py | 54 +- src/nidm/experiment/tools/nidm_gmm.py | 206 ++-- src/nidm/experiment/tools/nidm_kmeans.py | 78 +- src/nidm/experiment/tools/nidm_linreg.py | 143 +-- src/nidm/experiment/tools/nidm_query.py | 3 +- .../tools/repronim_simple2_brainvolumes.py | 16 +- src/nidm/experiment/tools/rest.py | 43 +- tests/experiment/test_experiment_basic.py | 6 +- tests/experiment/test_map_vars_to_terms.py | 18 +- tests/experiment/test_query.py | 16 +- tests/experiment/tools/test_rest.py | 33 +- 25 files changed, 854 insertions(+), 1117 deletions(-) diff --git a/src/nidm/core/Constants.py b/src/nidm/core/Constants.py index 430ea6b..bde3724 100644 --- a/src/nidm/core/Constants.py +++ b/src/nidm/core/Constants.py @@ -588,17 +588,13 @@ def __init__(self, namespaces=None): PROVONE_ATTRIBUTES = ( PROVONE_ATTRIBUTE_QNAMES | PROV_ATTRIBUTE_QNAMES | PROV_ATTRIBUTE_LITERALS ) -PROVONE_RECORD_ATTRIBUTES = list((attr, str(attr)) for attr in PROVONE_ATTRIBUTES) +PROVONE_RECORD_ATTRIBUTES = [(attr, str(attr)) for attr in PROVONE_ATTRIBUTES] -PROV_RECORD_IDS_MAP = dict( - (PROV_N_MAP[rec_type_id], rec_type_id) for rec_type_id in PROV_N_MAP -) -PROVONE_ID_ATTRIBUTES_MAP = dict( - (prov_id, attribute) for (prov_id, attribute) in PROVONE_RECORD_ATTRIBUTES -) -PROVONE_ATTRIBUTES_ID_MAP = dict( - (attribute, prov_id) for (prov_id, attribute) in PROVONE_RECORD_ATTRIBUTES -) +PROV_RECORD_IDS_MAP = {value: rec_type_id for rec_type_id, value in PROV_N_MAP.items()} +PROVONE_ID_ATTRIBUTES_MAP = dict(PROVONE_RECORD_ATTRIBUTES) +PROVONE_ATTRIBUTES_ID_MAP = { + attribute: prov_id for (prov_id, attribute) in PROVONE_RECORD_ATTRIBUTES +} # ADDED BY DBK to make searching NIDM-Experiment Terms easier...temporary, should be done in the OWL file # diff --git a/src/nidm/core/provone.py b/src/nidm/core/provone.py index b7c3428..b140267 100644 --- a/src/nidm/core/provone.py +++ b/src/nidm/core/provone.py @@ -100,8 +100,6 @@ class ProvPlan(ProvEntity): ProvONE Plan element """ - pass - class Process(ProvEntity): """ @@ -905,7 +903,7 @@ def serialize(self, destination=None, format="json", **args): # noqa: A002 serializer.serialize(stream, **args) else: location = destination - scheme, netloc, path, params, _query, fragment = urlparse(location) + _, netloc, path, _, _, _ = urlparse(location) if netloc != "": print( "WARNING: not saving as location " + "is not a local file reference" diff --git a/src/nidm/core/serializers/__init__.py b/src/nidm/core/serializers/__init__.py index f12ec73..3635824 100644 --- a/src/nidm/core/serializers/__init__.py +++ b/src/nidm/core/serializers/__init__.py @@ -38,8 +38,6 @@ def deserialize(self, stream, **kwargs): class DoNotExist(Error): """Exception for the case a serializer is not available.""" - pass - class Registry: """Registry of serializers.""" diff --git a/src/nidm/core/serializers/provonerdf.py b/src/nidm/core/serializers/provonerdf.py index 6a29f39..6711646 100644 --- a/src/nidm/core/serializers/provonerdf.py +++ b/src/nidm/core/serializers/provonerdf.py @@ -309,10 +309,10 @@ def encode_container(self, bundle, container=None, identifier=None): rec_uri = rec_type.uri for attr_name, val in record.extra_attributes: if attr_name == PROV["type"]: - if ( - PROV["Revision"] == val - or PROV["Quotation"] == val - or PROV["PrimarySource"] == val + if val in ( + PROV["Revision"], + PROV["Quotation"], + PROV["PrimarySource"], ): qualifier = val._localpart rec_uri = val.uri @@ -453,8 +453,8 @@ def decode_container(self, graph, bundle): PROV_CLS_MAP = {} formal_attributes = {} unique_sets = {} - for key, _ in PROV_BASE_CLS.items(): - PROV_CLS_MAP[key.uri] = PROV_BASE_CLS[key] + for key, value in PROV_BASE_CLS.items(): + PROV_CLS_MAP[key.uri] = value relation_mapper = { URIRef(PROV["alternateOf"].uri): "alternate", URIRef(PROV["actedOnBehalfOf"].uri): "delegation", @@ -575,9 +575,7 @@ def decode_container(self, graph, bundle): obj1 = self.decode_rdf_representation(obj, graph) if obj is not None and obj1 is None: raise ValueError(("Error transforming", obj)) - pred_new = pred - if pred in predicate_mapper: - pred_new = predicate_mapper[pred] + pred_new = predicate_mapper.get(pred, pred) if ids[id_] == PROV_COMMUNICATION and "activity" in str(pred_new): pred_new = PROV_ATTR_INFORMANT if ids[id_] == PROV_DELEGATION and "agent" in str(pred_new): @@ -605,12 +603,10 @@ def decode_container(self, graph, bundle): if local_key in ids: if "qualified" in pred: formal_attributes[local_key][ - list(formal_attributes[local_key].keys())[0] + next(iter(formal_attributes[local_key])) ] = id_ - for id_ in ids: - attrs = None - if id_ in other_attributes: - attrs = other_attributes[id_] + for id_, idvalue in ids.items(): + attrs = other_attributes.get(id_) items_to_walk = [] for qname, values in unique_sets[id_].items(): if values and len(values) > 1: @@ -619,9 +615,9 @@ def decode_container(self, graph, bundle): for subset in list(walk(items_to_walk)): for key, value in subset.items(): formal_attributes[id_][key] = value - bundle.new_record(ids[id_], id_, formal_attributes[id_], attrs) + bundle.new_record(idvalue, id_, formal_attributes[id_], attrs) else: - bundle.new_record(ids[id_], id_, formal_attributes[id_], attrs) + bundle.new_record(idvalue, id_, formal_attributes[id_], attrs) ids[id_] = None if attrs is not None: other_attributes[id_] = [] diff --git a/src/nidm/experiment/Acquisition.py b/src/nidm/experiment/Acquisition.py index fea7a10..ad42c40 100644 --- a/src/nidm/experiment/Acquisition.py +++ b/src/nidm/experiment/Acquisition.py @@ -83,10 +83,7 @@ def acquisition_object_exists(self, uuid): :param uuid: full uuid of acquisition :return: True if exists, False otherwise """ - if uuid in self._acquisition_objects: - return True - else: - return False + return bool(uuid in self._acquisition_objects) def __str__(self): return "NIDM-Experiment Acquisition Class" diff --git a/src/nidm/experiment/Core.py b/src/nidm/experiment/Core.py index 29bf42b..e85b16d 100644 --- a/src/nidm/experiment/Core.py +++ b/src/nidm/experiment/Core.py @@ -16,7 +16,7 @@ def getUUID(): uid = str(uuid.uuid1()) # added to address some weird bug in rdflib where if the uuid starts with a number, everything up until the first # alapha character becomes a prefix... - if not (re.match("^[a-fA-F]+.*", uid)): + if not re.match("^[a-fA-F]+.*", uid): # if first digit is not a character than replace it with a randomly selected hex character (a-f). uid_temp = uid randint = random.randint(0, 5) @@ -116,11 +116,7 @@ def checkNamespacePrefix(self, prefix): :return: True if prefix exists, False if not """ # check if prefix already exists - if prefix in self.graph._namespaces.keys(): - # prefix already exists - return True - else: - return False + return bool(prefix in self.graph._namespaces) def safe_string(self, string): return ( @@ -538,53 +534,42 @@ def save_DotGraph(self, filename, format=None): # noqa: A002 project_uuid = str(row[0]) # for each Project uuid search dot structure for Project uuid project_node = None - for key, _ in dot.obj_dict["nodes"].items(): + for key, value in dot.obj_dict["nodes"].items(): # get node number in DOT graph for Project - if "URL" in dot.obj_dict["nodes"][key][0]["attributes"]: - if project_uuid in str( - dot.obj_dict["nodes"][key][0]["attributes"]["URL"] - ): - project_node = key - break + if project_uuid in str(value[0]["attributes"].get("URL", "")): + project_node = key + break # for each Session in Project class self.sessions list, find node numbers in DOT graph for session in self.sessions: print(session) - for key, _ in dot.obj_dict["nodes"].items(): + for key, value in dot.obj_dict["nodes"].items(): # get node number in DOT graph for Project - if "URL" in dot.obj_dict["nodes"][key][0]["attributes"]: - if session.identifier.uri in str( - dot.obj_dict["nodes"][key][0]["attributes"]["URL"] - ): - session_node = key - # print(f"session node = {key}") - - # add to DOT structure edge between project_node and session_node - dot.add_edge(Edge(session_node, project_node, **style)) - - # for each Acquisition in Session class ._acquisitions list, find node numbers in DOT graph - for acquisition in session.get_acquisitions(): - # search through the nodes again to figure out node number for acquisition - for key, _ in dot.obj_dict["nodes"].items(): - # get node number in DOT graph for Project - if "URL" in dot.obj_dict["nodes"][key][0]["attributes"]: - if acquisition.identifier.uri in str( - dot.obj_dict["nodes"][key][0]["attributes"][ - "URL" - ] - ): - acquisition_node = key - # print(f"acquisition node = {key}") - - dot.add_edge( - Edge( - acquisition_node, session_node, **style - ) - ) + if session.identifier.uri in str(value[0]["attributes"].get("URL", "")): + session_node = key + # print(f"session node = {key}") + + # add to DOT structure edge between project_node and session_node + dot.add_edge(Edge(session_node, project_node, **style)) + + # for each Acquisition in Session class ._acquisitions list, find node numbers in DOT graph + for acquisition in session.get_acquisitions(): + # search through the nodes again to figure out node number for acquisition + for key, value in dot.obj_dict["nodes"].items(): + # get node number in DOT graph for Project + if acquisition.identifier.uri in str( + value[0]["attributes"].get("URL", "") + ): + acquisition_node = key + # print(f"acquisition node = {key}") + + dot.add_edge( + Edge(acquisition_node, session_node, **style) + ) # add some logic to find nodes with dct:hasPart relation and add those edges to graph...prov_to_dot ignores these - if not (format == "None"): + if format != "None": dot.write(filename, format=format) else: dot.write(filename, format="pdf") diff --git a/src/nidm/experiment/Derivative.py b/src/nidm/experiment/Derivative.py index 7b71e5d..186db14 100644 --- a/src/nidm/experiment/Derivative.py +++ b/src/nidm/experiment/Derivative.py @@ -72,10 +72,7 @@ def derivative_object_exists(self, uuid): :param uuid: full uuid of derivative object :return: True if exists, False otherwise """ - if uuid in self._derivative_objects: - return True - else: - return False + return bool(uuid in self._derivative_objects) def __str__(self): return "NIDM-Experiment Derivative Activity Class" diff --git a/src/nidm/experiment/Navigate.py b/src/nidm/experiment/Navigate.py index 52f443e..dd0ed64 100644 --- a/src/nidm/experiment/Navigate.py +++ b/src/nidm/experiment/Navigate.py @@ -139,7 +139,7 @@ def getNamespaceLookup(nidm_file_tuples): for f in nidm_file_tuples: rdf_graph = OpenGraph(f) for prefix, uri in rdf_graph.namespace_manager.namespaces(): - if not str(uri) in names: + if str(uri) not in names: names[str(uri)] = prefix return names @@ -504,18 +504,14 @@ def GetDataelementDetails(nidm_files_tuple, dataelement): dti = getDataTypeInfo(rdf_graph, de_uri) # check if this is the correct one - if not ( - dataelement - in [ - str(dti["label"]), - str(dti["dataElement"]), - str(dti["dataElementURI"]), - ] - ): + if dataelement not in [ + str(dti["label"]), + str(dti["dataElement"]), + str(dti["dataElementURI"]), + ]: continue - for key in dti.keys(): - result[key] = dti[key] + result.update(dti) result["inProjects"] = set() # figure out what project the dataelement was used in @@ -542,24 +538,20 @@ def GetDataelementDetails(nidm_files_tuple, dataelement): return result # found it, we are done - if result == {}: # didn't find it yet, check the CDEs + if not result: # didn't find it yet, check the CDEs cde_graph = nidm.experiment.CDE.getCDEs() for de_uri in cde_graph.subjects(predicate=isa): dti = getDataTypeInfo(cde_graph, de_uri) # check if this is the correct one - if not ( - dataelement - in [ - str(dti["label"]), - str(dti["dataElement"]), - str(dti["dataElementURI"]), - ] - ): + if dataelement not in [ + str(dti["label"]), + str(dti["dataElement"]), + str(dti["dataElementURI"]), + ]: continue - for key in dti.keys(): - result[key] = dti[key] + result.update(dti) result["inProjects"] = set() result["inProjects"].add("Common Data Element") diff --git a/src/nidm/experiment/Query.py b/src/nidm/experiment/Query.py index 650f504..89b08f4 100644 --- a/src/nidm/experiment/Query.py +++ b/src/nidm/experiment/Query.py @@ -36,7 +36,7 @@ def sparql_query_nidm(nidm_file_list, query, output_file=None, return_graph=Fals :return: dataframe | graph depending on return_graph parameter """ - if "BLAZEGRAPH_URL" in environ.keys(): + if "BLAZEGRAPH_URL" in environ: try: # first make sure all files are loaded into blazegraph for nidm_file in nidm_file_list: @@ -902,14 +902,10 @@ def CheckSubjectMatchesFilter( instrument_details = GetParticipantInstrumentData( nidm_file_list, project_uuid, subject_uuid ) - for instrument_uuid in instrument_details: - for instrument_term in instrument_details[instrument_uuid]: + for terms in instrument_details.values(): + for instrument_term, v in terms.items(): if instrument_term in synonyms: - found_match = filterCompare( - instrument_details[instrument_uuid][instrument_term], - op, - value, - ) + found_match = filterCompare(v, op, value) if found_match: break @@ -921,8 +917,8 @@ def CheckSubjectMatchesFilter( derivatives_details = GetDerivativesDataForSubject( nidm_file_list, project_uuid, subject_uuid ) - for key in derivatives_details: - derivatives = derivatives_details[key]["values"] + for details in derivatives_details.values(): + derivatives = details["values"] for ( vkey ) in ( @@ -1345,7 +1341,7 @@ def OpenGraph(file): return file # If we have a Blazegraph instance, load the data then do the rest - if "BLAZEGRAPH_URL" in environ.keys(): + if "BLAZEGRAPH_URL" in environ: try: with open(file) as f: data = f.read() diff --git a/src/nidm/experiment/Utils.py b/src/nidm/experiment/Utils.py index 73c769c..a35dcda 100644 --- a/src/nidm/experiment/Utils.py +++ b/src/nidm/experiment/Utils.py @@ -54,8 +54,7 @@ INTERLEX_PREFIX = "ilx_" INTERLEX_ENDPOINT = "https://scicrunch.org/api/1/" else: - print("ERROR: Interlex mode can only be 'test' or 'production'") - exit(1) + raise RuntimeError("ERROR: Interlex mode can only be 'test' or 'production'") def safe_string(s): @@ -99,24 +98,19 @@ def read_nidm(nidmDoc): print(f"Error reading NIDM-Exp Document {nidmDoc}, Must have Project Object") print() create_obj = input("Should read_nidm create a Project object for you [yes]: ") - if create_obj == "yes" or create_obj == "": + if create_obj in ("yes", ""): project = Project(empty_graph=True, add_default_type=True) # add namespaces to prov graph for name, namespace in rdf_graph_parse.namespaces(): # skip these default namespaces in prov Document - if ( - (name != "prov") - and (name != "xsd") - and (name != "nidm") - and (name != "niiri") - ): + if name not in ("prov", "xsd", "nidm", "niiri"): project.graph.add_namespace(name, namespace) else: - exit(1) + sys.exit(1) else: # Split subject URI into namespace, term - nm, project_uuid = split_uri(proj_id) + _, project_uuid = split_uri(proj_id) # create empty prov graph project = Project(empty_graph=True, uuid=project_uuid, add_default_type=False) @@ -124,12 +118,7 @@ def read_nidm(nidmDoc): # add namespaces to prov graph for name, namespace in rdf_graph_parse.namespaces(): # skip these default namespaces in prov Document - if ( - (name != "prov") - and (name != "xsd") - and (name != "nidm") - and (name != "niiri") - ): + if name not in ("prov", "xsd", "nidm", "niiri"): project.graph.add_namespace(name, namespace) # Cycle through Project metadata adding to prov graph @@ -145,7 +134,7 @@ def read_nidm(nidmDoc): # print(f"session: {s}") # Split subject URI for session into namespace, uuid - nm, session_uuid = split_uri(s) + _, session_uuid = split_uri(s) # print(f"session uuid= {session_uuid}") @@ -164,8 +153,8 @@ def read_nidm(nidmDoc): predicate=Constants.DCT["isPartOf"], object=s ): # Split subject URI for session into namespace, uuid - nm, acq_uuid = split_uri(acq) - # print(f"acquisition uuid: {acq_uuid}") + _, acq_uuid = split_uri(acq) + # print("acquisition uuid:", acq_uuid) # query for whether this is an AssessmentAcquisition of other Acquisition, etc. for rdf_type in rdf_graph_parse.objects(subject=acq, predicate=RDF.type): @@ -176,8 +165,8 @@ def read_nidm(nidmDoc): predicate=Constants.PROV["wasGeneratedBy"], object=acq ): # Split subject URI for acquisition object (entity) into namespace, uuid - nm, acq_obj_uuid = split_uri(acq_obj) - # print(f"acquisition object uuid: {acq_obj_uuid}") + _, acq_obj_uuid = split_uri(acq_obj) + # print("acquisition object uuid:", acq_obj_uuid) # query for whether this is an MRI acquisition by way of looking at the generated entity and determining # if it has the tuple [uuid Constants.NIDM_ACQUISITION_MODALITY Constants.NIDM_MRI] @@ -232,8 +221,8 @@ def read_nidm(nidmDoc): URIRef(Constants.NIDM_MRI_BOLD_EVENTS._uri), ) in rdf_graph: # Split subject URI for associated acquisition entity for nidm:StimulusResponseFile into namespace, uuid - nm, assoc_acq_uuid = split_uri(assoc_acq) - # print(f"associated acquisition object (stimulus file) uuid: {assoc_acq_uuid}") + _, assoc_acq_uuid = split_uri(assoc_acq) + # print("associated acquisition object (stimulus file) uuid:", assoc_acq_uuid) # if so then add this entity and associate it with acquisition activity and MRI entity events_obj = AcquisitionObject( acquisition=acquisition, uuid=assoc_acq_uuid @@ -492,9 +481,7 @@ def get_RDFliteral_type(rdf_literal): if rdf_literal.datatype == XSD["integer"]: # return (int(rdf_literal)) return pm.Literal(rdf_literal, datatype=pm.XSD["integer"]) - elif (rdf_literal.datatype == XSD["float"]) or ( - rdf_literal.datatype == XSD["double"] - ): + elif rdf_literal.datatype in (XSD["float"], XSD["double"]): # return(float(rdf_literal)) return pm.Literal(rdf_literal, datatype=pm.XSD["float"]) else: @@ -555,10 +542,10 @@ def add_metadata_for_subject(rdf_graph, subject_uri, namespaces, nidm_obj): obj_nm, obj_term = split_uri(objects) # added because PyNIDM agent, activity, and entity classes already add the type - if ( - (objects == Constants.PROV["Activity"]) - or (objects == Constants.PROV["Agent"]) - or (objects == Constants.PROV["Entity"]) + if objects in ( + Constants.PROV["Activity"], + Constants.PROV["Agent"], + Constants.PROV["Entity"], ): continue # special case if obj_nm is prov, xsd, or nidm namespaces. These are added @@ -900,7 +887,7 @@ def QuerySciCrunchElasticSearch( f"ERROR: Valid types for SciCrunch query are 'cde','pde', or 'fde'. You set type: {type} " ) print("ERROR: in function Utils.py/QuerySciCrunchElasticSearch") - exit(1) + sys.exit(1) response = requests.post( "https://scicrunch.org/api/1/elastic-ilx/interlex/term/_search#", @@ -1297,8 +1284,8 @@ def getSubjIDColumn(column_to_terms, df): # look at column_to_terms dictionary for NIDM URL for subject id (Constants.NIDM_SUBJECTID) id_field = None - for key, _ in column_to_terms.items(): - if Constants.NIDM_SUBJECTID._str == column_to_terms[key]["label"]: + for key, value in column_to_terms.items(): + if Constants.NIDM_SUBJECTID._str == value["label"]: id_field = key # if we couldn't find a subject ID field in column_to_terms, ask user @@ -1396,22 +1383,14 @@ def match_participant_id_field(source_variable): This function will test whether the source_variable is a participant ID field or not by string matching. :param source_variable: source variable string to test """ - - if ( - ("participant_id" in source_variable.lower()) - or ("subject_id" in source_variable.lower()) - or ( - ("participant" in source_variable.lower()) - and ("id" in source_variable.lower()) - ) - or ( - ("subject" in source_variable.lower()) and ("id" in source_variable.lower()) - ) - or (("sub" in source_variable.lower()) and ("id" in source_variable.lower())) - ): - return True - else: - return False + source_variable = source_variable.lower() + return ( + "participant_id" in source_variable + or "subject_id" in source_variable + or ("participant" in source_variable and "id" in source_variable) + or ("subject" in source_variable and "id" in source_variable) + or ("sub" in source_variable and "id" in source_variable) + ) def map_variables_to_terms( @@ -1449,8 +1428,8 @@ def map_variables_to_terms( with open(json_source, "r") as f: json_map = json.load(f) else: - print(f"ERROR: Can't open json mapping file: {json_source}") - exit() + print("ERROR: Can't open json mapping file:", json_source) + sys.exit() except Exception: # if not then it's a json structure already json_map = json_source @@ -1460,7 +1439,7 @@ def map_variables_to_terms( "ERROR: Invalid JSON file supplied. Please check your JSON file with a validator first!" ) print("exiting!") - exit() + sys.exit() # if no JSON mapping file was specified then create a default one for variable-term mappings # create a json_file filename from the output file filename @@ -1496,8 +1475,7 @@ def map_variables_to_terms( current_tuple = str(DD(source=assessment_name, variable=column)) # if we loaded a json file with existing mappings - try: - json_map + if json_source is not None: # try: # check for column in json file try: @@ -1511,501 +1489,422 @@ def map_variables_to_terms( .lstrip("'") .rstrip("'") ] - except Exception as e: - if "list index out of range" in str(e): - json_key = [ - key for key in json_map if column.lstrip().rstrip() == key - ] + except IndexError: + json_key = [key for key in json_map if column.lstrip().rstrip() == key] - finally: - if (json_map is not None) and (len(json_key) > 0): - column_to_terms[current_tuple] = {} - - # added in case for some reason there isn't a label key, try source_variable and if it's - # a key then add this as the label as well. - if "label" not in json_map[json_key[0]].keys(): - if "source_variable" in json_map[json_key[0]].keys(): - column_to_terms[current_tuple]["label"] = json_map[ - json_key[0] - ]["source_variable"] - elif "sourceVariable" in json_map[json_key[0]].keys(): - column_to_terms[current_tuple]["label"] = json_map[ - json_key[0] - ]["sourceVariable"] - else: - column_to_terms[current_tuple]["label"] = "" - print( - "No label or source_variable or sourceVariable keys found in json mapping file for variable " - f"{json_key[0]}. Consider adding these to the json file as they are important" - ) - else: - column_to_terms[current_tuple]["label"] = json_map[json_key[0]][ - "label" - ] - # added this bit to account for BIDS json files using "Description" whereas we use "description" - # everywhere else - if "description" in json_map[json_key[0]].keys(): - column_to_terms[current_tuple]["description"] = json_map[ - json_key[0] - ]["description"] - elif "Description" in json_map[json_key[0]].keys(): - column_to_terms[current_tuple]["description"] = json_map[ - json_key[0] - ]["Description"] - else: - column_to_terms[current_tuple]["description"] = "" - # column_to_terms[current_tuple]['variable'] = json_map[json_key[0]]['variable'] + if json_map is not None and len(json_key) > 0: + column_to_terms[current_tuple] = {} - print("\n" + ("*" * 85)) - print( - f"Column {column} already annotated in user supplied JSON mapping file" - ) - print("label:", column_to_terms[current_tuple]["label"]) - print("description:", column_to_terms[current_tuple]["description"]) - if "url" in json_map[json_key[0]]: - column_to_terms[current_tuple]["url"] = json_map[json_key[0]][ - "url" + # added in case for some reason there isn't a label key, try source_variable and if it's + # a key then add this as the label as well. + if "label" not in json_map[json_key[0]]: + if "source_variable" in json_map[json_key[0]]: + column_to_terms[current_tuple]["label"] = json_map[json_key[0]][ + "source_variable" ] - print("url:", column_to_terms[current_tuple]["url"]) - # print("Variable:", column_to_terms[current_tuple]['variable']) - if "sameAs" in json_map[json_key[0]]: - column_to_terms[current_tuple]["sameAs"] = json_map[ - json_key[0] - ]["sameAs"] - print("sameAs:", column_to_terms[current_tuple]["sameAs"]) - if "url" in json_map[json_key[0]]: - column_to_terms[current_tuple]["url"] = json_map[json_key[0]][ - "url" + elif "sourceVariable" in json_map[json_key[0]].keys(): + column_to_terms[current_tuple]["label"] = json_map[json_key[0]][ + "sourceVariable" ] - print("url:", column_to_terms[current_tuple]["url"]) - - if "source_variable" in json_map[json_key[0]]: - column_to_terms[current_tuple]["source_variable"] = json_map[ - json_key[0] - ]["source_variable"] - print( - "source variable:", - column_to_terms[current_tuple]["source_variable"], - ) - elif "sourceVariable" in json_map[json_key[0]]: - column_to_terms[current_tuple]["source_variable"] = json_map[ - json_key[0] - ]["sourceVariable"] - print( - "source variable:", - column_to_terms[current_tuple]["source_variable"], - ) else: - # add source variable if not there... - column_to_terms[current_tuple]["source_variable"] = str(column) - print(f"Added source variable ({column}) to annotations") - - if "associatedWith" in json_map[json_key[0]]: - column_to_terms[current_tuple]["associatedWith"] = json_map[ - json_key[0] - ]["associatedWith"] - print( - "associatedWith:", - column_to_terms[current_tuple]["associatedWith"], - ) - if "allowableValues" in json_map[json_key[0]]: - column_to_terms[current_tuple]["allowableValues"] = json_map[ - json_key[0] - ]["allowableValues"] + column_to_terms[current_tuple]["label"] = "" print( - "allowableValues:", - column_to_terms[current_tuple]["allowableValues"], + "No label or source_variable or sourceVariable keys found in json mapping file for variable " + f"{json_key[0]}. Consider adding these to the json file as they are important" ) + else: + column_to_terms[current_tuple]["label"] = json_map[json_key[0]][ + "label" + ] + # added this bit to account for BIDS json files using "Description" whereas we use "description" + # everywhere else + if "description" in json_map[json_key[0]].keys(): + column_to_terms[current_tuple]["description"] = json_map[ + json_key[0] + ]["description"] + elif "Description" in json_map[json_key[0]].keys(): + column_to_terms[current_tuple]["description"] = json_map[ + json_key[0] + ]["Description"] + else: + column_to_terms[current_tuple]["description"] = "" + # column_to_terms[current_tuple]['variable'] = json_map[json_key[0]]['variable'] - # added to support ReproSchema json format - if "responseOptions" in json_map[json_key[0]]: - for subkey, _ in json_map[json_key[0]][ - "responseOptions" - ].items(): - if "valueType" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + print("\n" + ("*" * 85)) + print( + f"Column {column} already annotated in user supplied JSON mapping file" + ) + print("label:", column_to_terms[current_tuple]["label"]) + print("description:", column_to_terms[current_tuple]["description"]) + if "url" in json_map[json_key[0]]: + column_to_terms[current_tuple]["url"] = json_map[json_key[0]]["url"] + print("url:", column_to_terms[current_tuple]["url"]) + # print("Variable:", column_to_terms[current_tuple]['variable']) + if "sameAs" in json_map[json_key[0]]: + column_to_terms[current_tuple]["sameAs"] = json_map[json_key[0]][ + "sameAs" + ] + print("sameAs:", column_to_terms[current_tuple]["sameAs"]) + if "url" in json_map[json_key[0]]: + column_to_terms[current_tuple]["url"] = json_map[json_key[0]]["url"] + print("url:", column_to_terms[current_tuple]["url"]) + + if "source_variable" in json_map[json_key[0]]: + column_to_terms[current_tuple]["source_variable"] = json_map[ + json_key[0] + ]["source_variable"] + print( + "source variable:", + column_to_terms[current_tuple]["source_variable"], + ) + elif "sourceVariable" in json_map[json_key[0]]: + column_to_terms[current_tuple]["source_variable"] = json_map[ + json_key[0] + ]["sourceVariable"] + print( + "source variable:", + column_to_terms[current_tuple]["source_variable"], + ) + else: + # add source variable if not there... + column_to_terms[current_tuple]["source_variable"] = str(column) + print(f"Added source variable ({column}) to annotations") + + if "associatedWith" in json_map[json_key[0]]: + column_to_terms[current_tuple]["associatedWith"] = json_map[ + json_key[0] + ]["associatedWith"] + print( + "associatedWith:", + column_to_terms[current_tuple]["associatedWith"], + ) + if "allowableValues" in json_map[json_key[0]]: + column_to_terms[current_tuple]["allowableValues"] = json_map[ + json_key[0] + ]["allowableValues"] + print( + "allowableValues:", + column_to_terms[current_tuple]["allowableValues"], + ) + # added to support ReproSchema json format + if "responseOptions" in json_map[json_key[0]]: + for subkey in json_map[json_key[0]]["responseOptions"]: + if "valueType" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + + column_to_terms[current_tuple]["responseOptions"][ + "valueType" + ] = json_map[json_key[0]]["responseOptions"]["valueType"] + print( + "valueType:", column_to_terms[current_tuple]["responseOptions"][ "valueType" - ] = json_map[json_key[0]]["responseOptions"][ - "valueType" - ] - print( - "valueType:", - column_to_terms[current_tuple]["responseOptions"][ - "valueType" - ], - ) + ], + ) - elif "minValue" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + elif "minValue" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "minValue" + ] = json_map[json_key[0]]["responseOptions"]["minValue"] + print( + "minValue:", column_to_terms[current_tuple]["responseOptions"][ "minValue" - ] = json_map[json_key[0]]["responseOptions"]["minValue"] - print( - "minValue:", - column_to_terms[current_tuple]["responseOptions"][ - "minValue" - ], - ) + ], + ) - elif "maxValue" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + elif "maxValue" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "maxValue" + ] = json_map[json_key[0]]["responseOptions"]["maxValue"] + print( + "maxValue:", column_to_terms[current_tuple]["responseOptions"][ "maxValue" - ] = json_map[json_key[0]]["responseOptions"]["maxValue"] - print( - "maxValue:", - column_to_terms[current_tuple]["responseOptions"][ - "maxValue" - ], - ) - elif "choices" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + ], + ) + elif "choices" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "choices" + ] = json_map[json_key[0]]["responseOptions"]["choices"] + print( + "levels:", column_to_terms[current_tuple]["responseOptions"][ "choices" - ] = json_map[json_key[0]]["responseOptions"]["choices"] - print( - "levels:", - column_to_terms[current_tuple]["responseOptions"][ - "choices" - ], - ) - elif "hasUnit" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + ], + ) + elif "hasUnit" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "unitCode" + ] = json_map[json_key[0]]["responseOptions"]["hasUnit"] + print( + "units:", column_to_terms[current_tuple]["responseOptions"][ "unitCode" - ] = json_map[json_key[0]]["responseOptions"]["hasUnit"] - print( - "units:", - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ], - ) - elif "unitCode" in subkey: - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple][ - "responseOptions" - ] = {} + ], + ) + elif "unitCode" in subkey: + if ( + "responseOptions" + not in column_to_terms[current_tuple].keys() + ): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "unitCode" + ] = json_map[json_key[0]]["responseOptions"]["unitCode"] + print( + "units:", column_to_terms[current_tuple]["responseOptions"][ "unitCode" - ] = json_map[json_key[0]]["responseOptions"]["unitCode"] - print( - "units:", - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ], - ) + ], + ) - if "levels" in json_map[json_key[0]]: - # upgrade 'levels' to 'responseOptions'->'choices' - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "choices" - ] = json_map[json_key[0]]["levels"] - print( - "choices:", - column_to_terms[current_tuple]["responseOptions"][ - "choices" - ], - ) - elif "Levels" in json_map[json_key[0]]: - # upgrade 'levels' to 'responseOptions'->'choices' - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "choices" - ] = json_map[json_key[0]]["Levels"] - print( - "levels:", - column_to_terms[current_tuple]["responseOptions"][ - "choices" - ], - ) + if "levels" in json_map[json_key[0]]: + # upgrade 'levels' to 'responseOptions'->'choices' + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "choices" + ] = json_map[json_key[0]]["levels"] + print( + "choices:", + column_to_terms[current_tuple]["responseOptions"]["choices"], + ) + elif "Levels" in json_map[json_key[0]]: + # upgrade 'levels' to 'responseOptions'->'choices' + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "choices" + ] = json_map[json_key[0]]["Levels"] + print( + "levels:", + column_to_terms[current_tuple]["responseOptions"]["choices"], + ) - if "valueType" in json_map[json_key[0]]: - # upgrade 'valueType' to 'responseOptions'->'valueType - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "valueType" - ] = json_map[json_key[0]]["valueType"] - print( - "valueType:", - column_to_terms[current_tuple]["responseOptions"][ - "valueType" - ], - ) + if "valueType" in json_map[json_key[0]]: + # upgrade 'valueType' to 'responseOptions'->'valueType + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "valueType" + ] = json_map[json_key[0]]["valueType"] + print( + "valueType:", + column_to_terms[current_tuple]["responseOptions"]["valueType"], + ) - if "minValue" in json_map[json_key[0]]: - # upgrade 'minValue' to 'responseOptions'->'minValue - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "minValue" - ] = json_map[json_key[0]]["minValue"] - print( - "minValue:", - column_to_terms[current_tuple]["responseOptions"][ - "minValue" - ], - ) - elif "minimumValue" in json_map[json_key[0]]: - # upgrade 'minValue' to 'responseOptions'->'minValue - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "minValue" - ] = json_map[json_key[0]]["minimumValue"] - print( - "minValue:", - column_to_terms[current_tuple]["responseOptions"][ - "minValue" - ], - ) + if "minValue" in json_map[json_key[0]]: + # upgrade 'minValue' to 'responseOptions'->'minValue + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "minValue" + ] = json_map[json_key[0]]["minValue"] + print( + "minValue:", + column_to_terms[current_tuple]["responseOptions"]["minValue"], + ) + elif "minimumValue" in json_map[json_key[0]]: + # upgrade 'minValue' to 'responseOptions'->'minValue + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "minValue" + ] = json_map[json_key[0]]["minimumValue"] + print( + "minValue:", + column_to_terms[current_tuple]["responseOptions"]["minValue"], + ) - if "maxValue" in json_map[json_key[0]]: - # upgrade 'maxValue' to 'responseOptions'->'maxValue - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "maxValue" - ] = json_map[json_key[0]]["maxValue"] - print( - "maxValue:", - column_to_terms[current_tuple]["responseOptions"][ - "maxValue" - ], - ) - elif "maximumValue" in json_map[json_key[0]]: - # upgrade 'maxValue' to 'responseOptions'->'maxValue - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "maxValue" - ] = json_map[json_key[0]]["maximumValue"] - print( - "maxValue:", - column_to_terms[current_tuple]["responseOptions"][ - "maxValue" - ], - ) - if "hasUnit" in json_map[json_key[0]]: - # upgrade 'hasUnit' to 'responseOptions'->'unitCode - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ] = json_map[json_key[0]]["hasUnit"] - print( - "unitCode:", - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ], - ) - elif "Units" in json_map[json_key[0]]: - # upgrade 'Units' to 'responseOptions'->'unitCode - if ( - "responseOptions" - not in column_to_terms[current_tuple].keys() - ): - column_to_terms[current_tuple]["responseOptions"] = {} - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ] = json_map[json_key[0]]["Units"] - print( - "unitCode:", - column_to_terms[current_tuple]["responseOptions"][ - "unitCode" - ], - ) + if "maxValue" in json_map[json_key[0]]: + # upgrade 'maxValue' to 'responseOptions'->'maxValue + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "maxValue" + ] = json_map[json_key[0]]["maxValue"] + print( + "maxValue:", + column_to_terms[current_tuple]["responseOptions"]["maxValue"], + ) + elif "maximumValue" in json_map[json_key[0]]: + # upgrade 'maxValue' to 'responseOptions'->'maxValue + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "maxValue" + ] = json_map[json_key[0]]["maximumValue"] + print( + "maxValue:", + column_to_terms[current_tuple]["responseOptions"]["maxValue"], + ) + if "hasUnit" in json_map[json_key[0]]: + # upgrade 'hasUnit' to 'responseOptions'->'unitCode + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "unitCode" + ] = json_map[json_key[0]]["hasUnit"] + print( + "unitCode:", + column_to_terms[current_tuple]["responseOptions"]["unitCode"], + ) + elif "Units" in json_map[json_key[0]]: + # upgrade 'Units' to 'responseOptions'->'unitCode + if "responseOptions" not in column_to_terms[current_tuple].keys(): + column_to_terms[current_tuple]["responseOptions"] = {} + column_to_terms[current_tuple]["responseOptions"][ + "unitCode" + ] = json_map[json_key[0]]["Units"] + print( + "unitCode:", + column_to_terms[current_tuple]["responseOptions"]["unitCode"], + ) - if "isAbout" in json_map[json_key[0]]: - # check if we have a single isAbout or multiple... - if isinstance(json_map[json_key[0]]["isAbout"], list): - # isAbout is an empty list, do concept association if user asked for it else skip - if not json_map[json_key[0]]["isAbout"]: - if associate_concepts: - # provide user with opportunity to associate a concept with this annotation - find_concept_interactive( - column, - current_tuple, - column_to_terms, - ilx_obj, - nidm_owl_graph=nidm_owl_graph, - ) - # write annotations to json file so user can start up again if not doing whole file - write_json_mapping_file( - column_to_terms, output_file, bids - ) - else: - pass + if "isAbout" in json_map[json_key[0]]: + # check if we have a single isAbout or multiple... + if isinstance(json_map[json_key[0]]["isAbout"], list): + # isAbout is an empty list, do concept association if user asked for it else skip + if not json_map[json_key[0]]["isAbout"]: + if associate_concepts: + # provide user with opportunity to associate a concept with this annotation + find_concept_interactive( + column, + current_tuple, + column_to_terms, + ilx_obj, + nidm_owl_graph=nidm_owl_graph, + ) + # write annotations to json file so user can start up again if not doing whole file + write_json_mapping_file( + column_to_terms, output_file, bids + ) else: - # else create a new list - column_to_terms[current_tuple]["isAbout"] = [] - # for each isAbout entry - for subdict in json_map[json_key[0]]["isAbout"]: - # some entries may not have 'label' so check - if "label" in subdict.keys(): - column_to_terms[current_tuple][ - "isAbout" - ].append( - { - "@id": subdict["@id"], - "label": subdict["label"], - } - ) - print( - f"isAbout: @id = {subdict['@id']}, label = {subdict['label']}" - ) - else: - column_to_terms[current_tuple][ - "isAbout" - ].append({"@id": subdict["@id"]}) - print(f"isAbout: @id = {subdict['@id']}") - # for isabout_key,isabout_value in subdict.items(): - # column_to_terms[current_tuple]['isAbout'].append({isabout_key:isabout_value}) - # print(f"isAbout: {isabout_key} = {isabout_value}") - # if isAbout is a dictionary then we only have 1 isAbout...we'll upgrade it to a list - # to be consistent moving forward + pass else: + # else create a new list column_to_terms[current_tuple]["isAbout"] = [] - if "url" in json_map[json_key[0]]["isAbout"].keys(): - if "label" in json_map[json_key[0]]["isAbout"].keys(): + # for each isAbout entry + for subdict in json_map[json_key[0]]["isAbout"]: + # some entries may not have 'label' so check + if "label" in subdict.keys(): column_to_terms[current_tuple]["isAbout"].append( { - "@id": json_map[json_key[0]]["isAbout"][ - "url" - ], - "label": json_map[json_key[0]]["isAbout"][ - "label" - ], + "@id": subdict["@id"], + "label": subdict["label"], } ) - else: - column_to_terms[current_tuple]["isAbout"].append( - {"@id": json_map[json_key[0]]["isAbout"]["url"]} - ) - else: - if "label" in json_map[json_key[0]]["isAbout"].keys(): - column_to_terms[current_tuple]["isAbout"].append( - { - "@id": json_map[json_key[0]]["isAbout"][ - "@id" - ], - "label": json_map[json_key[0]]["isAbout"][ - "label" - ], - } + print( + f"isAbout: @id = {subdict['@id']}, label = {subdict['label']}" ) else: column_to_terms[current_tuple]["isAbout"].append( - {"@id": json_map[json_key[0]]["isAbout"]["@id"]} + {"@id": subdict["@id"]} ) - - print( - f"isAbout: @id = {column_to_terms[current_tuple]['isAbout']['@id']}, label = {column_to_terms[current_tuple]['isAbout']['label']}" - ) + print(f"isAbout: @id = {subdict['@id']}") + # for isabout_key,isabout_value in subdict.items(): + # column_to_terms[current_tuple]['isAbout'].append({isabout_key:isabout_value}) + # print(f"isAbout: {isabout_key} = {isabout_value}") + # if isAbout is a dictionary then we only have 1 isAbout...we'll upgrade it to a list + # to be consistent moving forward else: - # if user ran in mode where they want to associate concepts and this isn't the participant - # id field then associate concepts. - if match_participant_id_field( - json_map[json_key[0]]["sourceVariable"] - ): - column_to_terms[current_tuple]["isAbout"] = [] - column_to_terms[current_tuple]["isAbout"].append( - { - "@id": Constants.NIDM_SUBJECTID.uri, - "label": Constants.NIDM_SUBJECTID.localpart, - } - ) - write_json_mapping_file(column_to_terms, output_file, bids) - elif associate_concepts: - # provide user with opportunity to associate a concept with this annotation - find_concept_interactive( - column, - current_tuple, - column_to_terms, - ilx_obj, - nidm_owl_graph=nidm_owl_graph, - ) - # write annotations to json file so user can start up again if not doing whole file - write_json_mapping_file(column_to_terms, output_file, bids) + column_to_terms[current_tuple]["isAbout"] = [] + if "url" in json_map[json_key[0]]["isAbout"].keys(): + if "label" in json_map[json_key[0]]["isAbout"].keys(): + column_to_terms[current_tuple]["isAbout"].append( + { + "@id": json_map[json_key[0]]["isAbout"]["url"], + "label": json_map[json_key[0]]["isAbout"][ + "label" + ], + } + ) + else: + column_to_terms[current_tuple]["isAbout"].append( + {"@id": json_map[json_key[0]]["isAbout"]["url"]} + ) + else: + if "label" in json_map[json_key[0]]["isAbout"].keys(): + column_to_terms[current_tuple]["isAbout"].append( + { + "@id": json_map[json_key[0]]["isAbout"]["@id"], + "label": json_map[json_key[0]]["isAbout"][ + "label" + ], + } + ) + else: + column_to_terms[current_tuple]["isAbout"].append( + {"@id": json_map[json_key[0]]["isAbout"]["@id"]} + ) + + print( + f"isAbout: @id = {column_to_terms[current_tuple]['isAbout']['@id']}, label = {column_to_terms[current_tuple]['isAbout']['label']}" + ) + else: + # if user ran in mode where they want to associate concepts and this isn't the participant + # id field then associate concepts. + if match_participant_id_field( + json_map[json_key[0]]["sourceVariable"] + ): + column_to_terms[current_tuple]["isAbout"] = [] + column_to_terms[current_tuple]["isAbout"].append( + { + "@id": Constants.NIDM_SUBJECTID.uri, + "label": Constants.NIDM_SUBJECTID.localpart, + } + ) + write_json_mapping_file(column_to_terms, output_file, bids) + elif associate_concepts: + # provide user with opportunity to associate a concept with this annotation + find_concept_interactive( + column, + current_tuple, + column_to_terms, + ilx_obj, + nidm_owl_graph=nidm_owl_graph, + ) + # write annotations to json file so user can start up again if not doing whole file + write_json_mapping_file(column_to_terms, output_file, bids) print("*" * 87) print("-" * 87) if (json_map is not None) and (len(json_key) > 0): continue - except Exception as e: - # so if this is an IndexError then it's likely our json mapping file keys are of the BIDS type - # (simply variable names) instead of the more complex NIDM ones DD(file=XX,variable=YY) - - if "NameError" in str(e): - print("json annotation file not supplied") + else: + print("json annotation file not supplied") search_term = str(column) # added for an automatic mapping of participant_id, subject_id, and variants @@ -2045,7 +1944,7 @@ def map_variables_to_terms( print("-" * 87) continue # if we haven't already found an annotation for this column then have user create one. - if current_tuple not in column_to_terms.keys(): + if current_tuple not in column_to_terms: # create empty annotation structure for this source variable column_to_terms[current_tuple] = {} # enter user interaction function to get data dictionary annotations from user @@ -2139,11 +2038,11 @@ def write_json_mapping_file(source_variable_annotations, output_file, bids=False new_dict = {} # remove 'responseOptions' and move 'choices' to 'levels' key - for key, _ in temp_dict.items(): + for key, value in temp_dict.items(): new_dict[key] = {} - for subkey, subvalue in temp_dict[key].items(): + for subkey, subvalue in value.items(): if subkey == "responseOptions": - for subkey2, subvalue2 in temp_dict[key]["responseOptions"].items(): + for subkey2, subvalue2 in value["responseOptions"].items(): if subkey2 == "choices": new_dict[key]["levels"] = subvalue2 else: @@ -2229,8 +2128,8 @@ def find_concept_interactive( ) search_result = {} first_nidm_term = True - for key, _ in nidmterms_concepts_query.items(): - if nidmterms_concepts_query[key]["score"] > min_match_score: + for key, value in nidmterms_concepts_query.items(): + if value["score"] > min_match_score: if first_nidm_term: print() print("NIDM-Terms Concepts:") @@ -2238,20 +2137,16 @@ def find_concept_interactive( print( f"{option}: Label:", - nidmterms_concepts_query[key]["label"], + value["label"], "\t Definition:", - nidmterms_concepts_query[key]["definition"], + value["definition"], "\t URL:", - nidmterms_concepts_query[key]["url"], + value["url"], ) search_result[key] = {} - search_result[key]["label"] = nidmterms_concepts_query[key]["label"] - search_result[key]["definition"] = nidmterms_concepts_query[key][ - "definition" - ] - search_result[key]["preferred_url"] = nidmterms_concepts_query[key][ - "url" - ] + search_result[key]["label"] = value["label"] + search_result[key]["definition"] = value["definition"] + search_result[key]["preferred_url"] = value["url"] search_result[str(option)] = key option = option + 1 @@ -2268,14 +2163,14 @@ def find_concept_interactive( print("InterLex:") print() # print("Search Results: ") - for key, _ in ilx_result.items(): + for key, value in ilx_result.items(): print( f"{option}: Label:", - ilx_result[key]["label"], + value["label"], "\t Definition:", - ilx_result[key]["definition"], + value["definition"], "\t Preferred URL:", - ilx_result[key]["preferred_url"], + value["preferred_url"], ) search_result[key] = {} @@ -2293,8 +2188,8 @@ def find_concept_interactive( cogatlas_concepts.json, search_term ) first_cogatlas_concept = True - for key, _ in cogatlas_concepts_query.items(): - if cogatlas_concepts_query[key]["score"] > min_match_score + 20: + for key, value in cogatlas_concepts_query.items(): + if value["score"] > min_match_score + 20: if first_cogatlas_concept: print() print("Cognitive Atlas:") @@ -2303,22 +2198,18 @@ def find_concept_interactive( print( f"{option}: Label:", - cogatlas_concepts_query[key]["label"], + value["label"], "\t Definition: ", - cogatlas_concepts_query[key]["definition"].rstrip("\r\n"), + value["definition"].rstrip("\r\n"), ) search_result[key] = {} - search_result[key]["label"] = cogatlas_concepts_query[key][ - "label" - ] - search_result[key]["definition"] = cogatlas_concepts_query[key][ - "definition" - ].rstrip("\r\n") - search_result[key]["preferred_url"] = cogatlas_concepts_query[ - key - ]["url"] + search_result[key]["label"] = value["label"] + search_result[key]["definition"] = value["definition"].rstrip( + "\r\n" + ) + search_result[key]["preferred_url"] = value["url"] search_result[str(option)] = key - option = option + 1 + option += 1 except Exception: pass @@ -2327,24 +2218,20 @@ def find_concept_interactive( cogatlas_disorders_query = fuzzy_match_terms_from_cogatlas_json( cogatlas_disorders.json, search_term ) - for key, _ in cogatlas_disorders_query.items(): - if cogatlas_disorders_query[key]["score"] > min_match_score + 20: + for key, value in cogatlas_disorders_query.items(): + if value["score"] > min_match_score + 20: print( f"{option}: Label:", - cogatlas_disorders_query[key]["label"], + value["label"], "\t Definition: ", - cogatlas_disorders_query[key]["definition"].rstrip("\r\n"), + value["definition"].rstrip("\r\n"), ) search_result[key] = {} - search_result[key]["label"] = cogatlas_disorders_query[key][ - "label" - ] - search_result[key]["definition"] = cogatlas_disorders_query[ - key - ]["definition"].rstrip("\r\n") - search_result[key]["preferred_url"] = cogatlas_disorders_query[ - key - ]["url"] + search_result[key]["label"] = value["label"] + search_result[key]["definition"] = value["definition"].rstrip( + "\r\n" + ) + search_result[key]["preferred_url"] = value["url"] search_result[str(option)] = key option = option + 1 except Exception: @@ -2360,8 +2247,8 @@ def find_concept_interactive( ) first_nidm_term = True - for key, _ in nidm_constants_query.items(): - if nidm_constants_query[key]["score"] > min_match_score: + for key, value in nidm_constants_query.items(): + if value["score"] > min_match_score: if first_nidm_term: print() print("NIDM Ontology Terms:") @@ -2369,20 +2256,16 @@ def find_concept_interactive( print( f"{option}: Label:", - nidm_constants_query[key]["label"], + value["label"], "\t Definition:", - nidm_constants_query[key]["definition"], + value["definition"], "\t URL:", - nidm_constants_query[key]["url"], + value["url"], ) search_result[key] = {} - search_result[key]["label"] = nidm_constants_query[key]["label"] - search_result[key]["definition"] = nidm_constants_query[key][ - "definition" - ] - search_result[key]["preferred_url"] = nidm_constants_query[key][ - "url" - ] + search_result[key]["label"] = value["label"] + search_result[key]["definition"] = value["definition"] + search_result[key]["preferred_url"] = value["url"] search_result[str(option)] = key option = option + 1 @@ -2726,16 +2609,12 @@ def DD_UUID(element, dd_struct, dataset_identifier=None): for subkey, subvalue in dd_struct[str(key_tuple)][ "responseOptions" ].items(): - if ( - (subkey == "levels") - or (subkey == "Levels") - or (subkey == "choices") - ): - property_string = property_string + str(subvalue) + if subkey in ("levels", "Levels", "choices"): + property_string += str(subvalue) if subkey == "valueType": - property_string = property_string + str(subvalue) - if (subkey == "hasUnit") or (subkey == "unitCode"): - property_string = property_string + str(subvalue) + property_string += str(subvalue) + if subkey in ("hasUnit", "unitCode"): + property_string += str(subvalue) if key == "source_variable": variable_name = value @@ -2771,7 +2650,7 @@ def DD_to_nidm(dd_struct, dataset_identifier=None): # add the DataElement RDF type in the source namespace key_tuple = eval(key) - for subkey, _ in key_tuple._asdict().items(): + for subkey in key_tuple._asdict().keys(): if subkey == "variable": # item_ns = Namespace(dd_struct[str(key_tuple)]["url"]+"/") # g.bind(prefix=safe_string(item), namespace=item_ns) @@ -2813,7 +2692,7 @@ def DD_to_nidm(dd_struct, dataset_identifier=None): g.add((cde_id, Constants.NIDM["url"], URIRef(value))) elif key == "label": g.add((cde_id, Constants.RDFS["label"], Literal(value))) - elif (key == "levels") or (key == "Levels"): + elif key in ("levels", "Levels"): g.add((cde_id, Constants.NIDM["levels"], Literal(value))) elif key == "source_variable": g.add((cde_id, Constants.NIDM["sourceVariable"], Literal(value))) @@ -2833,7 +2712,7 @@ def DD_to_nidm(dd_struct, dataset_identifier=None): if isinstance(value, list): for subdict in value: for isabout_key, isabout_value in subdict.items(): - if (isabout_key == "@id") or (isabout_key == "url"): + if isabout_key in ("@id", "url"): last_id = isabout_value # add isAbout key which is the url g.add( @@ -2862,7 +2741,7 @@ def DD_to_nidm(dd_struct, dataset_identifier=None): # else we only have 1 isabout which is a dict else: for isabout_key, isabout_value in value.items(): - if (isabout_key == "@id") or (isabout_key == "url"): + if isabout_key in ("@id", "url"): last_id = isabout_value # add isAbout key which is the url g.add( @@ -2885,9 +2764,9 @@ def DD_to_nidm(dd_struct, dataset_identifier=None): elif key == "valueType": g.add((cde_id, Constants.NIDM["valueType"], URIRef(value))) - elif (key == "minValue") or (key == "minimumValue"): + elif key in ("minValue", "minimumValue"): g.add((cde_id, Constants.NIDM["minValue"], Literal(value))) - elif (key == "maxValue") or (key == "maximumValue"): + elif key in ("maxValue", "maximumValue"): g.add((cde_id, Constants.NIDM["maxValue"], Literal(value))) elif key == "hasUnit": g.add((cde_id, Constants.NIDM["unitCode"], Literal(value))) diff --git a/src/nidm/experiment/tools/bidsmri2nidm.py b/src/nidm/experiment/tools/bidsmri2nidm.py index 3a529ed..4c9df8e 100755 --- a/src/nidm/experiment/tools/bidsmri2nidm.py +++ b/src/nidm/experiment/tools/bidsmri2nidm.py @@ -13,6 +13,7 @@ import logging import os from os.path import isfile, join +import sys import bids from pandas import DataFrame from prov.model import PROV_TYPE, Namespace, QualifiedName @@ -216,12 +217,12 @@ def addbidsignore(directory, filename_to_add): # adds filename_to_add to .bidsignore file in directory if not isfile(os.path.join(directory, ".bidsignore")): with open(os.path.join(directory, ".bidsignore"), "w") as text_file: - text_file.write("%s\n" % filename_to_add) + print(filename_to_add, file=text_file) else: with open(os.path.join(directory, ".bidsignore")) as fp: if filename_to_add not in fp.read(): with open(os.path.join(directory, ".bidsignore"), "a") as text_file: - text_file.write("%s\n" % filename_to_add) + print(filename_to_add, file=text_file) def addimagingsessions( @@ -399,7 +400,7 @@ def addimagingsessions( logging.critical( "Error: BIDS directory %s does not exist!", os.path.join(directory) ) - exit(-1) + sys.exit(-1) # add various attributes if they exist in BIDS dataset for key in dataset: @@ -594,7 +595,7 @@ def addimagingsessions( logging.critical( "Error: BIDS directory %s does not exist!", os.path.join(directory) ) - exit(-1) + sys.exit(-1) # add various attributes if they exist in BIDS dataset for key in dataset: @@ -936,12 +937,12 @@ def bidsmri2project(directory, args): logging.critical( "Cannot find dataset_description.json file which is required in the BIDS spec" ) - exit("-1") + sys.exit(-1) else: logging.critical( "Error: BIDS directory %s does not exist!", os.path.join(directory) ) - exit("-1") + sys.exit(-1) # create project / nidm-exp doc project = Project() @@ -996,7 +997,7 @@ def bidsmri2project(directory, args): column_to_terms = {} for field in participants_data.fieldnames: # column is not in BIDS_Constants - if not (field in BIDS_Constants.participants): + if field not in BIDS_Constants.participants: # add column to list for column_to_terms mapping mapping_list.append(field) @@ -1306,7 +1307,7 @@ def bidsmri2project(directory, args): column_to_terms = {} for field in pheno_data.fieldnames: # column is not in BIDS_Constants - if not (field in BIDS_Constants.participants): + if field not in BIDS_Constants.participants: # add column to list for column_to_terms mapping mapping_list.append(field) diff --git a/src/nidm/experiment/tools/csv2nidm.py b/src/nidm/experiment/tools/csv2nidm.py index d3486f2..4cea247 100644 --- a/src/nidm/experiment/tools/csv2nidm.py +++ b/src/nidm/experiment/tools/csv2nidm.py @@ -12,6 +12,7 @@ import os from os.path import basename, dirname, join from shutil import copy2 +import sys import pandas as pd from rdflib import Graph from nidm.core import Constants @@ -131,7 +132,7 @@ def main(): "file types. Please change your input file appropriately and re-run." ) print("no NIDM file created!") - exit(1) + sys.exit(1) # temp = csv.reader(args.csv_file) # df = pd.DataFrame(temp) @@ -190,12 +191,10 @@ def main(): # look at column_to_terms dictionary for NIDM URL for subject id (Constants.NIDM_SUBJECTID) id_field = None - for key, _ in column_to_terms.items(): - if "isAbout" in column_to_terms[key]: - for isabout_key, isabout_value in column_to_terms[key][ - "isAbout" - ].items(): - if (isabout_key == "url") or (isabout_key == "@id"): + for key, value in column_to_terms.items(): + if "isAbout" in value: + for isabout_key, isabout_value in value["isAbout"].items(): + if isabout_key in ("url", "@id"): if isabout_value == Constants.NIDM_SUBJECTID._uri: key_tuple = eval(key) # id_field=key @@ -259,7 +258,7 @@ def main(): # if there was data about this subject in the NIDM file already (i.e. an agent already exists with this subject id) # then add this CSV assessment data to NIDM file, else skip it.... - if not (len(csv_row.index) == 0): + if len(csv_row.index) != 0: logging.info("found participant in CSV file") # create a new session for this assessment @@ -343,12 +342,12 @@ def main(): # look at column_to_terms dictionary for NIDM URL for subject id (Constants.NIDM_SUBJECTID) id_field = None - for key, _ in column_to_terms.items(): + for key, value in column_to_terms.items(): # using isAbout concept association to associate subject identifier variable from csv with a known term # for subject IDs - if "isAbout" in column_to_terms[key]: + if "isAbout" in value: # iterate over isAbout list entries and look for Constants.NIDM_SUBJECTID - for entries in column_to_terms[key]["isAbout"]: + for entries in value["isAbout"]: if Constants.NIDM_SUBJECTID.uri == entries["@id"]: key_tuple = eval(key) id_field = key_tuple.variable diff --git a/src/nidm/experiment/tools/nidm2bids.py b/src/nidm/experiment/tools/nidm2bids.py index 0c21510..381f17b 100644 --- a/src/nidm/experiment/tools/nidm2bids.py +++ b/src/nidm/experiment/tools/nidm2bids.py @@ -14,6 +14,7 @@ from os import mkdir, system from os.path import basename, isdir, isfile, join, splitext from shutil import copyfile +import sys import tempfile import urllib.parse import datalad.api as dl @@ -52,27 +53,27 @@ def GetImageFromAWS(location, output_file, args): # remove everything from location string before openneuro openneuro_loc = location[location.find("openneuro/") + 10 :] # get a temporary directory for this file - temp_dir = tempfile.TemporaryDirectory() + temp_dir = tempfile.mkdtemp() # aws command cmd = ( "aws s3 cp --no-sign-request " + "s3://openneuro.org/" + openneuro_loc + " " - + temp_dir.name + + temp_dir ) # execute command print(cmd) system(cmd) # check if aws command downloaded something - if not isfile(join(temp_dir.name, basename(location))): + if not isfile(join(temp_dir, basename(location))): print("Couldn't get dataset from AWS either...") return None else: try: # copy file from temp_dir to bids dataset print("Copying temporary file to final location....") - copyfile(join(temp_dir.name, basename(location)), output_file) + copyfile(join(temp_dir, basename(location)), output_file) return True except Exception: print("Couldn't get dataset from AWS either...") @@ -86,21 +87,21 @@ def GetImageFromAWS(location, output_file, args): # remove everything from location string before openneuro loc = location[location.find(args.dataset_string) + len(args.dataset_string) :] # get a temporary directory for this file - temp_dir = tempfile.TemporaryDirectory() + temp_dir = tempfile.mkdtemp() # aws command - cmd = "aws s3 cp --no-sign-request " + aws_baseurl + loc + " " + temp_dir.name + cmd = "aws s3 cp --no-sign-request " + aws_baseurl + loc + " " + temp_dir # execute command print(cmd) system(cmd) # check if aws command downloaded something - if not isfile(join(temp_dir.name, basename(location))): + if not isfile(join(temp_dir, basename(location))): print("Couldn't get dataset from AWS either...") return None else: try: # copy file from temp_dir to bids dataset print("Copying temporary file to final location....") - copyfile(join(temp_dir.name, basename(location)), output_file) + copyfile(join(temp_dir, basename(location)), output_file) return True except Exception: print("Couldn't get dataset from AWS either...") @@ -307,7 +308,6 @@ def CreateBIDSParticipantFile(nidm_graph, output_file, participant_fields): # Step(2): Query for prov:Entity that were prov:wasGeneratedBy uris from Step(1) # Step(3): For each metadata triple in objects whose subject is uris from Step(2), fuzzy match predicate after # removing base of uri to "fields" in participants list, then add these to data list for appending to pandas - match_ratio = {} # # Steps(1):(3) @@ -356,8 +356,7 @@ def CreateBIDSParticipantFile(nidm_graph, output_file, participant_fields): # add row to participants DataFrame # participants=participants.append(pd.DataFrame(data)) - participants - row_index = row_index + 1 + row_index += 1 # save participants.tsv file participants.to_csv(output_file + ".tsv", sep="\t", index=False) @@ -389,19 +388,15 @@ def NIDMProject2BIDSDatasetDescriptor(nidm_graph, output_directory): # make copy of project_metadata project_metadata_tmp = dict(project_metadata) # iterate over the temporary dictionary and delete items from the original - for proj_key, _ in project_metadata_tmp.items(): + for proj_key in project_metadata_tmp: key_found = 0 # print(f"proj_key = {proj_key} ") # print(f"project_metadata[proj_key] = {project_metadata[proj_key]}") - for key, _ in BIDS_Constants.dataset_description.items(): - if BIDS_Constants.dataset_description[key]._uri == proj_key: + for key, value in BIDS_Constants.dataset_description.items(): + if value._uri == proj_key: # added since BIDS validator validates values of certain keys - if ( - (key == "Authors") - or (key == "Funding") - or (key == "ReferencesAndLinks") - ): + if key in ("Authors", "Funding", "ReferencesAndLinks"): project_metadata[key] = [project_metadata[proj_key]] else: project_metadata[key] = project_metadata[proj_key] @@ -436,11 +431,7 @@ def AddMetadataToImageSidecar(graph_entity, graph, output_directory, image_filen json_dict = {} for row in qres: key = next( - ( - k - for k in BIDS_Constants.json_keys - if BIDS_Constants.json_keys[k] == row[0] - ), + (k for k, v in BIDS_Constants.json_keys.items() if v == row[0]), None, ) if key is not None: @@ -821,7 +812,7 @@ def main(): " string in your AWS S3 urls then just supply -aws_baseurl with nothing after it." ) print(args.print_help()) - exit(-1) + sys.exit(-1) # set up some local variables rdf_file = args.rdf_file @@ -853,7 +844,7 @@ def main(): "File doesn't appear to be a valid RDF format supported by Python RDFLib! Please check input file" ) print("exiting...") - exit(-1) + sys.exit(-1) # if not os.path.isdir(join(output_directory,os.path.splitext(args.rdf_file)[0])): # os.mkdir(join(output_directory,os.path.splitext(args.rdf_file)[0])) diff --git a/src/nidm/experiment/tools/nidm_affinity_propagation.py b/src/nidm/experiment/tools/nidm_affinity_propagation.py index a76389e..7089350 100644 --- a/src/nidm/experiment/tools/nidm_affinity_propagation.py +++ b/src/nidm/experiment/tools/nidm_affinity_propagation.py @@ -1,5 +1,6 @@ import csv import os +import sys import tempfile import click import matplotlib.pyplot as plt @@ -83,11 +84,7 @@ def data_aggregation(): # all data from all the files is collected global full_model_variable_list full_model_variable_list = [] global model_list - model_list = v.split(",") - for i in range( - len(model_list) - ): # here, we remove any leading or trailing spaces - model_list[i] = model_list[i].strip() + model_list = [vv.strip() for vv in v.split(",")] global variables # used in dataparsing() variables = "" for i in range(len(model_list) - 1, -1, -1): @@ -122,18 +119,16 @@ def data_aggregation(): # all data from all the files is collected numcols = (len(data) - 1) // ( len(model_list) ) # Finds the number of columns in the original dataframe - global condensed_data # also used in linreg() condensed_data_holder[count] = [ [0] * (len(model_list)) ] # makes an array 1 row by the number of necessary columns for _ in range( numcols ): # makes the 2D array big enough to store all of the necessary values in the edited dataset - condensed_data_holder[count].append([0] * (len(model_list))) - for i in range( - len(model_list) - ): # stores the independent variable names in the first row - condensed_data_holder[count][0][i] = model_list[i] + condensed_data_holder[count].append([0] * len(model_list)) + for i, ml in enumerate(model_list): + # stores the independent variable names in the first row + condensed_data_holder[count][0][i] = ml numrows = 1 # begins at the first row to add data fieldcolumn = ( 0 # the column the variable name is in in the original dataset @@ -231,14 +226,10 @@ def data_aggregation(): # all data from all the files is collected if count1 > len(condensed_data_holder[count]) - 2: not_found_list.append(condensed_data_holder[count][0][i]) count1 = 0 - for i in range(len(condensed_data_holder[count][0])): - if " " in condensed_data_holder[count][0][i]: - condensed_data_holder[count][0][i] = condensed_data_holder[count][ - 0 - ][i].replace(" ", "_") - for i in range(len(variables)): - if " " in variables[i]: - variables[i] = variables[i].replace(" ", "_") + for i, cdh in enumerate(condensed_data_holder[count][0]): + condensed_data_holder[count][0][i] = cdh.replace(" ", "_") + for i, vrb in enumerate(variables): + variables[i] = vrb.replace(" ", "_") count = count + 1 if len(not_found_list) > 0: print("*" * 107) @@ -258,23 +249,22 @@ def data_aggregation(): # all data from all the files is collected + nidm_file + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables." ) - for i in range(0, len(not_found_list)): - print(str(i + 1) + ". " + not_found_list[i]) + for i, nf in enumerate(not_found_list): + print(f"{i+1}. {nf}") if o is not None: with open(o, "a") as f: - f.write(str(i + 1) + ". " + not_found_list[i]) - for j in range(len(not_found_list) - 1, 0, -1): - not_found_list.pop(j) - not_found_count = not_found_count + 1 + f.write(f"{i+1}. {nf}") + not_found_list.clear() + not_found_count += 1 print() if not_found_count > 0: - exit(1) + sys.exit(1) else: print("ERROR: No query parameter provided. See help:") print() os.system("pynidm query --help") - exit(1) + sys.exit(1) def dataparsing(): # The data is changed to a format that is usable by the linear regression method @@ -314,8 +304,8 @@ def dataparsing(): # The data is changed to a format that is usable by the line le = ( preprocessing.LabelEncoder() ) # anything involving le shows the encoding of categorical variables - for i in range(len(stringvars)): - le.fit(obj_df[stringvars[i]].astype(str)) + for sv in stringvars: + le.fit(obj_df[sv].astype(str)) obj_df_trf = obj_df.astype(str).apply( le.fit_transform ) # transforms the categorical variables into numbers. @@ -345,8 +335,7 @@ def ap(): for i in range(1, len(condensed_data)): if condensed_data[i][index] not in levels: levels.append(condensed_data[i][index]) - for i in range(len(levels)): - levels[i] = i + levels = list(range(len(levels))) # Beginning of the linear regression global X @@ -355,9 +344,9 @@ def ap(): scaler = MinMaxScaler() - for i in range(len(model_list)): - scaler.fit(df_final[[model_list[i]]]) - df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]]) + for ml in model_list: + scaler.fit(df_final[[ml]]) + df_final[[ml]] = scaler.transform(df_final[[ml]]) X = df_final[model_list] @@ -380,10 +369,7 @@ def ap(): sns.scatterplot(data=X, x=model_list[0], y=model_list[1], hue=af, palette="gnuplot") plt.xlabel(model_list[1]) plt.ylabel(model_list[0]) - title = "Clustering results of " - for i in range(len(model_list)): - title = title + model_list[i] + "," - title = title[0 : len(title) - 1] + title = "Clustering results of " + ",".join(model_list) plt.title(title) plt.show() if o is not None: diff --git a/src/nidm/experiment/tools/nidm_agglomerative_clustering.py b/src/nidm/experiment/tools/nidm_agglomerative_clustering.py index 60d705c..22b29a0 100644 --- a/src/nidm/experiment/tools/nidm_agglomerative_clustering.py +++ b/src/nidm/experiment/tools/nidm_agglomerative_clustering.py @@ -1,5 +1,6 @@ import csv import os +import sys import tempfile import click import matplotlib.pyplot as plt @@ -83,11 +84,7 @@ def data_aggregation(): # all data from all the files is collected global full_model_variable_list full_model_variable_list = [] global model_list - model_list = v.split(",") - for i in range( - len(model_list) - ): # here, we remove any leading or trailing spaces - model_list[i] = model_list[i].strip() + model_list = [vv.strip() for vv in v.split(",")] global variables # used in dataparsing() variables = "" for i in range(len(model_list) - 1, -1, -1): @@ -122,18 +119,16 @@ def data_aggregation(): # all data from all the files is collected numcols = (len(data) - 1) // ( len(model_list) ) # Finds the number of columns in the original dataframe - global condensed_data # also used in linreg() condensed_data_holder[count] = [ [0] * (len(model_list)) ] # makes an array 1 row by the number of necessary columns for _ in range( numcols ): # makes the 2D array big enough to store all of the necessary values in the edited dataset - condensed_data_holder[count].append([0] * (len(model_list))) - for i in range( - len(model_list) - ): # stores the independent variable names in the first row - condensed_data_holder[count][0][i] = model_list[i] + condensed_data_holder[count].append([0] * len(model_list)) + for i, ml in enumerate(model_list): + # stores the independent variable names in the first row + condensed_data_holder[count][0][i] = ml numrows = 1 # begins at the first row to add data fieldcolumn = ( 0 # the column the variable name is in in the original dataset @@ -231,15 +226,11 @@ def data_aggregation(): # all data from all the files is collected if count1 > len(condensed_data_holder[count]) - 2: not_found_list.append(condensed_data_holder[count][0][i]) count1 = 0 - for i in range(len(condensed_data_holder[count][0])): - if " " in condensed_data_holder[count][0][i]: - condensed_data_holder[count][0][i] = condensed_data_holder[count][ - 0 - ][i].replace(" ", "_") - for i in range(len(variables)): - if " " in variables[i]: - variables[i] = variables[i].replace(" ", "_") - count = count + 1 + for i, cdh in enumerate(condensed_data_holder[count][0]): + if " " in cdh: + condensed_data_holder[count][0][i] = cdh.replace(" ", "_") + variables = [v.replace(" ", "_") for v in variables] + count += 1 if len(not_found_list) > 0: print("*" * 107) print() @@ -258,23 +249,23 @@ def data_aggregation(): # all data from all the files is collected + nidm_file + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables." ) - for i in range(0, len(not_found_list)): - print(str(i + 1) + ". " + not_found_list[i]) + for i, nf in enumerate(not_found_list): + print(f"{i+1}. {nf}") if o is not None: with open(o, "a") as f: - f.write(str(i + 1) + ". " + not_found_list[i]) + f.write(f"{i+1}. {nf}") for j in range(len(not_found_list) - 1, 0, -1): not_found_list.pop(j) not_found_count = not_found_count + 1 print() if not_found_count > 0: - exit(1) + sys.exit(1) else: print("ERROR: No query parameter provided. See help:") print() os.system("pynidm query --help") - exit(1) + sys.exit(1) def dataparsing(): # The data is changed to a format that is usable by the linear regression method @@ -314,8 +305,8 @@ def dataparsing(): # The data is changed to a format that is usable by the line le = ( preprocessing.LabelEncoder() ) # anything involving le shows the encoding of categorical variables - for i in range(len(stringvars)): - le.fit(obj_df[stringvars[i]].astype(str)) + for sv in stringvars: + le.fit(obj_df[sv].astype(str)) obj_df_trf = obj_df.astype(str).apply( le.fit_transform ) # transforms the categorical variables into numbers. @@ -345,8 +336,7 @@ def ac(): for i in range(1, len(condensed_data)): if condensed_data[i][index] not in levels: levels.append(condensed_data[i][index]) - for i in range(len(levels)): - levels[i] = i + levels = list(range(len(levels))) # Beginning of the linear regression global X @@ -355,9 +345,9 @@ def ac(): scaler = MinMaxScaler() - for i in range(len(model_list)): - scaler.fit(df_final[[model_list[i]]]) - df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]]) + for ml in model_list: + scaler.fit(df_final[[ml]]) + df_final[[ml]] = scaler.transform(df_final[[ml]]) X = df_final[ model_list diff --git a/src/nidm/experiment/tools/nidm_gmm.py b/src/nidm/experiment/tools/nidm_gmm.py index eb3d598..da4ff1c 100644 --- a/src/nidm/experiment/tools/nidm_gmm.py +++ b/src/nidm/experiment/tools/nidm_gmm.py @@ -1,5 +1,6 @@ import csv import os +import sys import tempfile import click import matplotlib.pyplot as plt @@ -113,11 +114,7 @@ def data_aggregation(): # all data from all the files is collected global var_list # below, we edit the model so it splits by +,~, or =. However, to help it out in catching everything # we replaced ~ and = with a + so that we can still use split. Regex wasn't working. - var_list = v.split(",") - for i in range( - len(var_list) - ): # here, we remove any leading or trailing spaces - var_list[i] = var_list[i].strip() + var_list = [vv.strip() for vv in v.split(",")] # set the dependent variable to the one dependent variable in the model global variables # used in dataparsing() variables = "" @@ -156,7 +153,6 @@ def data_aggregation(): # all data from all the files is collected numcols = (len(data) - 1) // ( len(var_list) ) # Finds the number of columns in the original dataframe - global condensed_data # also used in linreg() condensed_data_holder[count] = [ [0] * (len(var_list)) ] # makes an array 1 row by the number of necessary columns @@ -164,14 +160,10 @@ def data_aggregation(): # all data from all the files is collected numcols ): # makes the 2D array big enough to store all of the necessary values in the edited dataset condensed_data_holder[count].append([0] * (len(var_list))) - for m in range(0, len(var_list)): - end_url = var_list[m].split("/") - if "/" in var_list[m]: - var_list[m] = end_url[len(end_url) - 1] - for i in range( - len(var_list) - ): # stores the independent variable names in the first row - condensed_data_holder[count][0][i] = var_list[i] + var_list = [v.split("/")[-1] for v in var_list] + for i, vr in enumerate(var_list): + # stores the independent variable names in the first row + condensed_data_holder[count][0][i] = vr numrows = 1 # begins at the first row to add data fieldcolumn = ( 0 # the column the variable name is in in the original dataset @@ -267,18 +259,11 @@ def data_aggregation(): # all data from all the files is collected if count1 > len(condensed_data_holder[count]) - 2: not_found_list.append(condensed_data_holder[count][0][i]) count1 = 0 - for i in range(len(condensed_data_holder[count][0])): - if " " in condensed_data_holder[count][0][i]: - condensed_data_holder[count][0][i] = condensed_data_holder[count][ - 0 - ][i].replace(" ", "_") - for i in range(len(var_list)): - if "/" in var_list[i]: - split = var_list[i].split("/") - var_list[i] = split[len(split) - 1] - if " " in var_list[i]: - var_list[i] = var_list[i].replace(" ", "_") - count = count + 1 + for i, cdh in enumerate(condensed_data_holder[count][0]): + if " " in cdh: + condensed_data_holder[count][0][i] = cdh.replace(" ", "_") + var_list = [vr.split("/")[-1].replace(" ", "_") for vr in var_list] + count += 1 if len(not_found_list) > 0: print("*" * 107) print() @@ -297,23 +282,22 @@ def data_aggregation(): # all data from all the files is collected + nidm_file + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables." ) - for i in range(0, len(not_found_list)): - print(str(i + 1) + ". " + not_found_list[i]) + for i, nf in enumerate(not_found_list): + print(f"{i+1}. {nf}") if o is not None: with open(o, "a") as f: - f.write(str(i + 1) + ". " + not_found_list[i]) - for j in range(len(not_found_list) - 1, 0, -1): - not_found_list.pop(j) - not_found_count = not_found_count + 1 + f.write(f"{i+1}. {nf}") + not_found_list.clear() + not_found_count += 1 print() if not_found_count > 0: - exit(1) + sys.exit(1) else: print("ERROR: No query parameter provided. See help:") print() os.system("pynidm k-means --help") - exit(1) + sys.exit(1) def dataparsing(): # The data is changed to a format that is usable by the linear regression method @@ -363,8 +347,8 @@ def dataparsing(): # The data is changed to a format that is usable by the line le = ( preprocessing.LabelEncoder() ) # anything involving le shows the encoding of categorical variables - for i in range(len(stringvars)): - le.fit(obj_df[stringvars[i]].astype(str)) + for sv in stringvars: + le.fit(obj_df[sv].astype(str)) obj_df_trf = obj_df.astype(str).apply( le.fit_transform ) # transforms the categorical variables into numbers. @@ -394,19 +378,18 @@ def cluster_number(): for i in range(1, len(condensed_data)): if condensed_data[i][index] not in levels: levels.append(condensed_data[i][index]) - for i in range(len(levels)): - levels[i] = i + levels = list(range(len(levels))) # Beginning of the linear regression global X # global y # Unsure on how to proceed here with interacting variables, since I'm sure dmatrices won't work - """scaler = MinMaxScaler() - - for i in range(len(model_list)): - scaler.fit(df_final[[model_list[i]]]) - df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]])""" + # scaler = MinMaxScaler() + # + # for i in range(len(model_list)): + # scaler.fit(df_final[[model_list[i]]]) + # df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]]) X = df_final[var_list] if "si" in cm.lower(): print("Sillhoute Score") @@ -420,10 +403,10 @@ def cluster_number(): ss.append(silhouette_avg) optimal_i = 0 distance_to_one = abs(1 - ss[0]) - for i in range(0, len(ss)): - if abs(1 - ss[i]) <= distance_to_one: + for i, s in enumerate(ss): + if abs(1 - s) <= distance_to_one: optimal_i = i - distance_to_one = abs(1 - ss[i]) + distance_to_one = abs(1 - s) n_clusters = optimal_i + 2 print( @@ -454,38 +437,39 @@ def cluster_number(): print( "Optimal number of clusters: " + str(n_clusters) ) # optimal number of clusters, minimizing aic - """min_aic = aic[0] - max_aic = aic[0] - max_i = 0 - min_i = 0 - for i in range(1, len(aic)): - if aic[i] >= max_aic: - max_aic = aic[i] - max_i = i - elif aic[i] <= min_aic: - min_aic = aic[i] - min_i = i - p1 = np.array([min_i, aic[min_i]]) - p2 = np.array([max_i, aic[max_i]]) - # the way I am doing the method is as follows: - # the different sse values form a curve like an L (like an exponential decay) - # The elbow is the point furthest from a line connecting max and min - # So I am calculating the distance, and the maximum distance from point to curve shows the optimal point - # AKA the number of clusters - dist = [] - for n in range(0, len(aic)): - norm = np.linalg.norm - p3 = np.array([n, aic[n]]) - dist.append(np.abs(norm(np.cross(p2 - p1, p1 - p3))) / norm(p2 - p1)) - max_dist = dist[0] - n_clusters = 2 - for x in range(1, len(dist)): - if dist[x] >= max_dist: - max_dist = dist[x] - n_clusters = x + 2 - plt.plot(aic) - plt.show()""" + # min_aic = aic[0] + # max_aic = aic[0] + # max_i = 0 + # min_i = 0 + # for i in range(1, len(aic)): + # if aic[i] >= max_aic: + # max_aic = aic[i] + # max_i = i + # elif aic[i] <= min_aic: + # min_aic = aic[i] + # min_i = i + # p1 = np.array([min_i, aic[min_i]]) + # p2 = np.array([max_i, aic[max_i]]) + # # the way I am doing the method is as follows: + # # the different sse values form a curve like an L (like an exponential decay) + # # The elbow is the point furthest from a line connecting max and min + # # So I am calculating the distance, and the maximum distance from point to curve shows the optimal point + # # AKA the number of clusters + # dist = [] + # for n in range(0, len(aic)): + # norm = np.linalg.norm + # p3 = np.array([n, aic[n]]) + # dist.append(np.abs(norm(np.cross(p2 - p1, p1 - p3))) / norm(p2 - p1)) + # max_dist = dist[0] + # n_clusters = 2 + # for x in range(1, len(dist)): + # if dist[x] >= max_dist: + # max_dist = dist[x] + # n_clusters = x + 2 + # + # plt.plot(aic) + # plt.show() gmm = GaussianMixture(n_components=n_clusters).fit(X) labels = gmm.fit(X).predict(X) @@ -509,43 +493,41 @@ def cluster_number(): min_bic = bic[i] min_i = i n_clusters = min_i + 2 - """min_bic = bic[0] - max_bic = bic[0] - max_i = 0 - min_i = 0 - for i in range(1,len(bic)): - if bic[i]>=max_bic: - max_bic = bic[i] - max_i = i - elif bic[i]<= min_bic: - min_bic = bic[i] - min_i = i - p1 = np.array([min_i, bic[min_i]]) - p2 = np.array([max_i, bic[max_i]]) - # the way I am doing the method is as follows: - # the different sse values form a curve like an L (like an exponential decay) - # The elbow is the point furthest from a line connecting max and min - # So I am calculating the distance, and the maximum distance from point to curve shows the optimal point - # AKA the number of clusters - dist = [] - for n in range(0, len(bic)): - norm = np.linalg.norm - p3 = np.array([n, bic[n]]) - dist.append(np.abs(norm(np.cross(p2 - p1, p1 - p3))) / norm(p2 - p1)) - max_dist = dist[0] - n_clusters = 2 - for x in range(1, len(dist)): - if dist[x] >= max_dist: - max_dist = dist[x] - n_clusters = x + 2 - plt.plot(bic) - plt.show()""" - print( - "Optimal number of clusters: " + str(n_clusters) - ) # optimal number of clusters + # min_bic = bic[0] + # max_bic = bic[0] + # max_i = 0 + # min_i = 0 + # for i in range(1,len(bic)): + # if bic[i]>=max_bic: + # max_bic = bic[i] + # max_i = i + # elif bic[i]<= min_bic: + # min_bic = bic[i] + # min_i = i + # p1 = np.array([min_i, bic[min_i]]) + # p2 = np.array([max_i, bic[max_i]]) + # # the way I am doing the method is as follows: + # # the different sse values form a curve like an L (like an exponential decay) + # # The elbow is the point furthest from a line connecting max and min + # # So I am calculating the distance, and the maximum distance from point to curve shows the optimal point + # # AKA the number of clusters + # dist = [] + # for n in range(0, len(bic)): + # norm = np.linalg.norm + # p3 = np.array([n, bic[n]]) + # dist.append(np.abs(norm(np.cross(p2 - p1, p1 - p3))) / norm(p2 - p1)) + # max_dist = dist[0] + # n_clusters = 2 + # for x in range(1, len(dist)): + # if dist[x] >= max_dist: + # max_dist = dist[x] + # n_clusters = x + 2 + # plt.plot(bic) + # plt.show() + print("Optimal number of clusters:", n_clusters) gmm = GaussianMixture(n_components=n_clusters).fit(X) labels = gmm.fit(X).predict(X) - ax = None or plt.gca() + ax = plt.gca() X = df_final[var_list].to_numpy() ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap="viridis", zorder=2) ax.axis("equal") diff --git a/src/nidm/experiment/tools/nidm_kmeans.py b/src/nidm/experiment/tools/nidm_kmeans.py index 09c8bbc..219977b 100644 --- a/src/nidm/experiment/tools/nidm_kmeans.py +++ b/src/nidm/experiment/tools/nidm_kmeans.py @@ -1,5 +1,6 @@ import csv import os +import sys import tempfile import warnings import click @@ -120,11 +121,7 @@ def data_aggregation(): # all data from all the files is collected global var_list # below, we edit the model so it splits by +,~, or =. However, to help it out in catching everything # we replaced ~ and = with a + so that we can still use split. Regex wasn't working. - var_list = v.split(",") - for i in range( - len(var_list) - ): # here, we remove any leading or trailing spaces - var_list[i] = var_list[i].strip() + var_list = [vv.strip() for vv in v.split(",")] # set the dependent variable to the one dependent variable in the model global variables # used in dataparsing() variables = "" @@ -162,7 +159,6 @@ def data_aggregation(): # all data from all the files is collected numcols = (len(data) - 1) // ( len(var_list) ) # Finds the number of columns in the original dataframe - global condensed_data # also used in linreg() condensed_data_holder[count] = [ [0] * (len(var_list)) ] # makes an array 1 row by the number of necessary columns @@ -170,14 +166,10 @@ def data_aggregation(): # all data from all the files is collected numcols ): # makes the 2D array big enough to store all of the necessary values in the edited dataset condensed_data_holder[count].append([0] * (len(var_list))) - for m in range(0, len(var_list)): - end_url = var_list[m].split("/") - if "/" in var_list[m]: - var_list[m] = end_url[len(end_url) - 1] - for i in range( - len(var_list) - ): # stores the independent variable names in the first row - condensed_data_holder[count][0][i] = var_list[i] + var_list = [vr.split("/")[-1] for vr in var_list] + for i, vr in enumerate(var_list): + # stores the independent variable names in the first row + condensed_data_holder[count][0][i] = vr numrows = 1 # begins at the first row to add data fieldcolumn = ( 0 # the column the variable name is in in the original dataset @@ -273,18 +265,11 @@ def data_aggregation(): # all data from all the files is collected if count1 > len(condensed_data_holder[count]) - 2: not_found_list.append(condensed_data_holder[count][0][i]) count1 = 0 - for i in range(len(condensed_data_holder[count][0])): - if " " in condensed_data_holder[count][0][i]: - condensed_data_holder[count][0][i] = condensed_data_holder[count][ - 0 - ][i].replace(" ", "_") - for i in range(len(var_list)): - if "/" in var_list[i]: - split = var_list[i].split("/") - var_list[i] = split[len(split) - 1] - if " " in var_list[i]: - var_list[i] = var_list[i].replace(" ", "_") - count = count + 1 + for i, cdh in enumerate(condensed_data_holder[count][0]): + if " " in cdh: + condensed_data_holder[count][0][i] = cdh.replace(" ", "_") + var_list = [vr.split("/")[-1].replace(" ", "_") for vr in var_list] + count += 1 if len(not_found_list) > 0: print("*" * 107) print() @@ -303,23 +288,23 @@ def data_aggregation(): # all data from all the files is collected + nidm_file + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables." ) - for i in range(0, len(not_found_list)): - print(str(i + 1) + ". " + not_found_list[i]) + for i, nf in enumerate(not_found_list): + print(f"{i+1}. {nf}") if o is not None: with open(o, "a") as f: - f.write(str(i + 1) + ". " + not_found_list[i]) + f.write(f"{i+1}. {nf}") for j in range(len(not_found_list) - 1, 0, -1): not_found_list.pop(j) not_found_count = not_found_count + 1 print() if not_found_count > 0: - exit(1) + sys.exit(1) else: print("ERROR: No query parameter provided. See help:") print() os.system("pynidm k-means --help") - exit(1) + sys.exit(1) def dataparsing(): # The data is changed to a format that is usable by the linear regression method @@ -369,8 +354,8 @@ def dataparsing(): # The data is changed to a format that is usable by the line le = ( preprocessing.LabelEncoder() ) # anything involving le shows the encoding of categorical variables - for i in range(len(stringvars)): - le.fit(obj_df[stringvars[i]].astype(str)) + for sv in stringvars: + le.fit(obj_df[sv].astype(str)) obj_df_trf = obj_df.astype(str).apply( le.fit_transform ) # transforms the categorical variables into numbers. @@ -400,19 +385,18 @@ def cluster_number(): for i in range(1, len(condensed_data)): if condensed_data[i][index] not in levels: levels.append(condensed_data[i][index]) - for i in range(len(levels)): - levels[i] = i + levels = list(range(len(levels))) # Beginning of the linear regression global X # global y # Unsure on how to proceed here with interacting variables, since I'm sure dmatrices won't work - """scaler = MinMaxScaler() - - for i in range(len(model_list)): - scaler.fit(df_final[[model_list[i]]]) - df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]])""" + # scaler = MinMaxScaler() + # + # for i in range(len(model_list)): + # scaler.fit(df_final[[model_list[i]]]) + # df_final[[model_list[i]]] = scaler.transform(df_final[[model_list[i]]]) X = df_final[var_list] if "ga" in cm.lower(): print("\n\nGap Statistic") @@ -487,9 +471,9 @@ def cluster_number(): # So I am calculating the distance, and the maximum distance from point to curve shows the optimal point # AKA the number of clusters dist = [] - for n in range(0, len(sse)): + for n, s in enumerate(sse): norm = np.linalg.norm - p3 = np.array([n, sse[n]]) + p3 = np.array([n, s]) dist.append(np.abs(norm(np.cross(p2 - p1, p1 - p3))) / norm(p2 - p1)) max_dist = dist[0] optimal_cluster = 2 @@ -526,10 +510,10 @@ def cluster_number(): ss.append(silhouette_avg) optimal_i = 0 distance_to_one = abs(1 - ss[0]) - for i in range(0, len(ss)): - if abs(1 - ss[i]) <= distance_to_one: + for i, s in enumerate(ss): + if abs(1 - s) <= distance_to_one: optimal_i = i - distance_to_one = abs(1 - ss[i]) + distance_to_one = abs(1 - s) n_clusters = optimal_i + 2 print( "Optimal number of clusters: " + str(n_clusters) @@ -639,8 +623,8 @@ def cluster_number(): # ask for help: how does one do a dendrogram, also without graphing? if o is not None: - f = open(o, "a") - f.close() + with open(o, "a"): + pass def opencsv(data): diff --git a/src/nidm/experiment/tools/nidm_linreg.py b/src/nidm/experiment/tools/nidm_linreg.py index d44526b..7f0be6c 100644 --- a/src/nidm/experiment/tools/nidm_linreg.py +++ b/src/nidm/experiment/tools/nidm_linreg.py @@ -3,6 +3,7 @@ import csv import os from statistics import mean +import sys import tempfile import warnings import click @@ -130,11 +131,7 @@ def data_aggregation(): # all data from all the files is collected plus_replace = m.replace("=", "+") elif "," in m: plus_replace = m.replace(",", "+") - model_list = plus_replace.split("+") - for i in range( - len(model_list) - ): # here, we remove any leading or trailing spaces - model_list[i] = model_list[i].strip() + model_list = [v.strip() for v in plus_replace.split("+")] full_model_variable_list = [] # set the dependent variable to the one dependent variable in the model global dep_var # used in dataparsing(), linreg(), and contrasting() @@ -170,7 +167,7 @@ def data_aggregation(): # all data from all the files is collected + dep_var + '" from either the right or the left side of the equation.' ) - exit(1) + sys.exit(1) else: ind_vars = ind_vars + model_list[i] + "," ind_vars = ind_vars[0 : len(ind_vars) - 1] @@ -203,7 +200,6 @@ def data_aggregation(): # all data from all the files is collected numcols = (len(data) - 1) // ( len(independentvariables) + 1 ) # Finds the number of columns in the original dataframe - global condensed_data # also used in linreg() condensed_data_holder[count] = [ [0] * (len(independentvariables) + 1) ] # makes an array 1 row by the number of necessary columns @@ -213,10 +209,9 @@ def data_aggregation(): # all data from all the files is collected condensed_data_holder[count].append( [0] * (len(independentvariables) + 1) ) - for i in range( - len(independentvariables) - ): # stores the independent variable names in the first row - condensed_data_holder[count][0][i] = independentvariables[i] + for i, v in enumerate(independentvariables): + # stores the independent variable names in the first row + condensed_data_holder[count][0][i] = v condensed_data_holder[count][0][-1] = str( dep_var ) # stores the dependent variable name in the first row @@ -254,11 +249,9 @@ def data_aggregation(): # all data from all the files is collected ): # column, so it can append the values under the proper variables try: split_url = condensed_data_holder[count][0][i].split("/") - for k in range(0, len(full_model_variable_list)): - if "/" in full_model_variable_list[k]: - full_model_variable_list[k] = split_url[ - len(split_url) - 1 - ] + for k, fmv in enumerate(full_model_variable_list): + if "/" in fmv: + full_model_variable_list[k] = split_url[-1] if ( data[j][fieldcolumn] == condensed_data_holder[count][0][i] ): # in the dataframe, the name is in column 3 @@ -330,20 +323,13 @@ def data_aggregation(): # all data from all the files is collected if count1 > len(condensed_data_holder[count]) - 2: not_found_list.append(condensed_data_holder[count][0][i]) count1 = 0 - for i in range(len(condensed_data_holder[count][0])): - if " " in condensed_data_holder[count][0][i]: - condensed_data_holder[count][0][i] = condensed_data_holder[count][ - 0 - ][i].replace(" ", "_") - for i in range(len(independentvariables)): - if "/" in independentvariables[i]: - split = independentvariables[i].split("/") - independentvariables[i] = split[len(split) - 1] - if " " in independentvariables[i]: - independentvariables[i] = independentvariables[i].replace(" ", "_") - if " " in dep_var: - dep_var = dep_var.replace(" ", "_") - count = count + 1 + for i, cdh in enumerate(condensed_data_holder[count][0]): + condensed_data_holder[count][0][i] = cdh.replace(" ", "_") + independentvariables = [ + v.split("/")[-1].replace(" ", "_") for v in independentvariables + ] + dep_var = dep_var.replace(" ", "_") + count += 1 if len(not_found_list) > 0: print("*" * 107) print() @@ -362,23 +348,22 @@ def data_aggregation(): # all data from all the files is collected + nidm_file + ". The model cannot run because this will skew the data. Try checking your spelling or use nidm_query.py to see other possible variables." ) - for i in range(0, len(not_found_list)): - print(str(i + 1) + ". " + not_found_list[i]) + for i, nf in enumerate(not_found_list): + print(f"{i+1}. {nf}") if o is not None: with open(o, "a") as f: - f.write(str(i + 1) + ". " + not_found_list[i]) - for j in range(len(not_found_list) - 1, 0, -1): - not_found_list.pop(j) - not_found_count = not_found_count + 1 + f.write(f"{i+1}. {nf}") + not_found_list.clear() + not_found_count += 1 print() if not_found_count > 0: - exit(1) + sys.exit(1) else: print("ERROR: No query parameter provided. See help:") print() os.system("pynidm linreg --help") - exit(1) + sys.exit(1) def dataparsing(): # The data is changed to a format that is usable by the linear regression method @@ -393,10 +378,13 @@ def dataparsing(): # The data is changed to a format that is usable by the line split = condensed_data[0][i].split("/") condensed_data[0][i] = split[len(split) - 1] - """In this section, if there are less than 20 points, the model will be inaccurate and there are too few variables for regularization. - That means that we warn the user that such errors can occur and ask them if they want to proceed. - The answer is stored in answer. If the user responds with N, it exits the code after writing the error to the output file (if there is one). - If the user says Y instead, the code runs, but stops before doing the regularization.""" + # In this section, if there are less than 20 points, the model will be + # inaccurate and there are too few variables for regularization. That + # means that we warn the user that such errors can occur and ask them if + # they want to proceed. The answer is stored in answer. If the user + # responds with N, it exits the code after writing the error to the output + # file (if there is one). If the user says Y instead, the code runs, but + # stops before doing the regularization. global answer answer = "?" if (len(condensed_data) - 1) < 20: @@ -420,7 +408,7 @@ def dataparsing(): # The data is changed to a format that is usable by the line f.write( "Due to a lack of data (<20 points), you stopped the model because the results may have been inaccurate." ) - exit(1) + sys.exit(1) x = pd.read_csv( opencsv(condensed_data) ) # changes the dataframe to a csv to make it easier to work with @@ -453,8 +441,8 @@ def dataparsing(): # The data is changed to a format that is usable by the line le = ( preprocessing.LabelEncoder() ) # anything involving le shows the encoding of categorical variables - for i in range(len(variables)): - le.fit(obj_df[variables[i]].astype(str)) + for v in variables: + le.fit(obj_df[v].astype(str)) obj_df_trf = obj_df.astype(str).apply( le.fit_transform ) # transforms the categorical variables into numbers. @@ -483,8 +471,8 @@ def linreg(): # actual linear regression model_string = [] model_string.append(dep_var) model_string.append(" ~ ") - for i in range(0, len(full_model_variable_list)): - model_string.append(full_model_variable_list[i]) + for fmv in full_model_variable_list: + model_string.append(fmv) model_string.append(" + ") model_string.pop(-1) global full_model @@ -502,8 +490,7 @@ def linreg(): # actual linear regression for i in range(1, len(condensed_data)): if condensed_data[i][index] not in levels: levels.append(condensed_data[i][index]) - for i in range(len(levels)): - levels[i] = i + levels = list(range(len(levels))) # Beginning of the linear regression global X @@ -513,13 +500,13 @@ def linreg(): # actual linear regression model_string = [] model_string.append(dep_var) model_string.append(" ~ ") - for i in range(0, len(full_model_variable_list)): - model_string.append(full_model_variable_list[i]) + for fmv in full_model_variable_list: + model_string.append(fmv) model_string.append(" + ") model_string.pop(-1) - for i in range(0, len(model_string)): - if "*" in model_string[i]: - replacement = model_string[i].split("*") + for i, mdl in enumerate(model_string): + if "*" in mdl: + replacement = mdl.split("*") model_string[i] = replacement[0] + ":" + replacement[1] # makes sure the model is in the right format. string = "".join(model_string) @@ -549,39 +536,28 @@ def linreg(): # actual linear regression def contrasting(): global c + global full_model_variable_list if c: # to account for multiple contrast variables contrastvars = [] if "," in c: contrastvars = c.split(",") - for i in range(len(contrastvars)): - contrastvars[i] = contrastvars[i].strip() - if " " in contrastvars[i]: - contrastvars[i] = contrastvars[i].replace(" ", "_") - if "/" in contrastvars[i]: # to account for URLs - split = contrastvars[i].split("/") - contrastvars[i] = split[len(split) - 1] - else: - split = c.split("/") # to account for URLs - c = split[len(split) - 1] - + contrastvars = [ + v.strip().replace(" ", "_").split("/")[-1] for v in contrastvars + ] + c = c.split("/")[-1] # to account for URLs ind_vars_no_contrast_var = "" index = 1 - for i in range(len(full_model_variable_list)): - if "/" in full_model_variable_list[i]: - split = full_model_variable_list[i].split("/") - full_model_variable_list[i] = split[len(split) - 1] - if " " in full_model_variable_list[i]: - full_model_variable_list[i] = full_model_variable_list[i].replace( - " ", "_" - ) + full_model_variable_list = [ + v.split("/")[-1].replace(" ", "_") for v in full_model_variable_list + ] for var in full_model_variable_list: - if var != c and not (var in contrastvars): + if var != c and var not in contrastvars: if index == 1: ind_vars_no_contrast_var = var index += 1 else: - ind_vars_no_contrast_var = ind_vars_no_contrast_var + " + " + var + ind_vars_no_contrast_var += " + " + var if len(contrastvars) > 0: contraststring = " + ".join(contrastvars) else: @@ -734,9 +710,8 @@ def code_without_intercept(self, levels): def regularizing(): - if r == ("L1" or "Lasso" or "l1" or "lasso") and not ( - "y" in answer.lower() - ): # does it say L1, and has the user chosen to go ahead with running the code? + # does it say L1, and has the user chosen to go ahead with running the code? + if r in ("L1", "Lasso", "l1", "lasso") and "y" not in answer.lower(): # Loop to compute the cross-validation scores max_cross_val_alpha = 1 max_cross_val_score = ( @@ -783,9 +758,8 @@ def regularizing(): f.write(f"\nIntercept: {lassoModelChosen.intercept_}") print() - if r == ("L2" or "Ridge" or "l2" or "Ridge") and not ( - "y" in answer.lower() - ): # does it say L2, and has the user chosen to go ahead with running the code? + # does it say L2, and has the user chosen to go ahead with running the code? + if r in ("L2", "Ridge", "l2", "ridge") and "y" not in answer.lower(): # Loop to compute the different values of cross-validation scores max_cross_val_alpha = 1 max_cross_val_score = ( @@ -810,9 +784,10 @@ def regularizing(): ) print(f"Current Model Score = {ridgeModelChosen.score(X, y)}") index = 0 - """This numpy_conversion part was necessary because for the ridge model, all the coefficients get stored in a - numpy array, and the conversion is necessary to get the coefficients. However, it is only needed if the model - has interacting variables.""" + # This numpy_conversion part was necessary because for the ridge model, + # all the coefficients get stored in a numpy array, and the conversion + # is necessary to get the coefficients. However, it is only needed if + # the model has interacting variables. numpy_conversion = False for var in full_model_variable_list: if ("*" in var) or (":" in var): diff --git a/src/nidm/experiment/tools/nidm_query.py b/src/nidm/experiment/tools/nidm_query.py index a282e08..9f94919 100644 --- a/src/nidm/experiment/tools/nidm_query.py +++ b/src/nidm/experiment/tools/nidm_query.py @@ -2,6 +2,7 @@ from json import dumps import os +import sys import click from click_option_group import RequiredMutuallyExclusiveOptionGroup, optgroup import pandas as pd @@ -276,7 +277,7 @@ def query( print("ERROR: No query parameter provided. See help:") print() os.system("pynidm query --help") - exit(1) + sys.exit(1) # it can be used calling the script `python nidm_query.py -nl ... -q .. diff --git a/src/nidm/experiment/tools/repronim_simple2_brainvolumes.py b/src/nidm/experiment/tools/repronim_simple2_brainvolumes.py index 1a740df..b3698e4 100644 --- a/src/nidm/experiment/tools/repronim_simple2_brainvolumes.py +++ b/src/nidm/experiment/tools/repronim_simple2_brainvolumes.py @@ -77,7 +77,7 @@ def add_brainvolume_data( # store participant id for later use in processing the data for this row participant_id = row_data # if there is no agent for the participant then add one - if row_data not in participant_agent.keys(): + if row_data not in participant_agent: # add an agent for this person participant_agent[row_data] = nidmdoc.graph.agent( QualifiedName( @@ -95,7 +95,7 @@ def add_brainvolume_data( ) # see if we already have a software_activity for this agent - if software_key not in software_activity.keys(): + if software_key not in software_activity: # create an activity for the computation...simply a placeholder for more extensive provenance software_activity[software_key] = nidmdoc.graph.activity( QualifiedName( @@ -128,7 +128,7 @@ def add_brainvolume_data( ) # check if there's an associated software agent and if not, create one - if software_key not in software_agent.keys(): + if software_key not in software_agent: # create an agent software_agent[software_key] = nidmdoc.graph.agent( QualifiedName( @@ -189,7 +189,7 @@ def add_brainvolume_data( if ( software_activity[software_key].identifier.localpart + participant_agent[participant_id].identifier.localpart - not in entity.keys() + not in entity ): # create an entity to store brain volume data for this participant entity[ @@ -276,7 +276,7 @@ def add_brainvolume_data( # if there was data about this subject in the NIDM file already (i.e. an agent already exists with this subject id) # then add this brain volumes data to NIDM file, else skip it.... - if not (len(csv_row.index) == 0): + if len(csv_row.index) != 0: print(f"found other data for participant {row[1]}") # Here we're sure we have an agent in the NIDM graph that corresponds to the participant in the @@ -309,7 +309,7 @@ def add_brainvolume_data( ) # see if we already have a software_activity for this agent - if software_key + row[2] not in software_activity.keys(): + if software_key + row[2] not in software_activity: # create an activity for the computation...simply a placeholder for more extensive provenance software_activity[ software_key + row[2] @@ -349,7 +349,7 @@ def add_brainvolume_data( ) # check if there's an associated software agent and if not, create one - if software_key not in software_agent.keys(): + if software_key not in software_agent: # if we have a URL defined for this software in Constants.py then use it else simply use the string name of the software product if software_key.lower() in Constants.namespaces: # create an agent @@ -457,7 +457,7 @@ def add_brainvolume_data( software_key + row[2] ].identifier.localpart + row[2] - not in entity.keys() + not in entity ): # create an entity to store brain volume data for this participant entity[ diff --git a/src/nidm/experiment/tools/rest.py b/src/nidm/experiment/tools/rest.py index 9627bd3..e925abe 100644 --- a/src/nidm/experiment/tools/rest.py +++ b/src/nidm/experiment/tools/rest.py @@ -11,9 +11,7 @@ from tabulate import tabulate from nidm.core import Constants from nidm.experiment import Navigate, Query -import nidm.experiment.Navigate from nidm.experiment.Utils import validate_uuid -import nidm.experiment.tools.rest_statistics def convertListtoDict(lst): @@ -491,20 +489,19 @@ def ExpandProjectMetaData(self, meta_data): for acq in Navigate.getAcquisitions(self.nidm_files, session): act_data = Navigate.getActivityData(self.nidm_files, acq) for de in act_data.data: - if ( - de.isAbout == "http://uri.interlex.org/ilx_0100400" - or de.isAbout == "http://uri.interlex.org/base/ilx_0100400" + if de.isAbout in ( + "http://uri.interlex.org/ilx_0100400", + "http://uri.interlex.org/base/ilx_0100400", ): - if de.value == "n/a" or de.value == "nan": + if de.value in ("n/a", "nan"): ages.add(float("nan")) else: ages.add(float(de.value)) - elif ( - de.isAbout == "http://uri.interlex.org/ilx_0101292" - or de.isAbout == "http://uri.interlex.org/base/ilx_0101292" - or de.isAbout == "http://uri.interlex.org/ilx_0738439" - or de.isAbout - == "https://ndar.nih.gov/api/datadictionary/v2/dataelement/gender" + elif de.isAbout in ( + "http://uri.interlex.org/ilx_0101292", + "http://uri.interlex.org/base/ilx_0101292", + "http://uri.interlex.org/ilx_0738439", + "https://ndar.nih.gov/api/datadictionary/v2/dataelement/gender", ): genders.add(de.value) elif ( @@ -525,7 +522,7 @@ def ExpandProjectMetaData(self, meta_data): project[str(Constants.NIDM_HANDEDNESS)] = list(hands) def projectStats(self): - result = dict() + result = {} subjects = None path = (urlparse(self.command)).path @@ -541,7 +538,7 @@ def projectStats(self): self.restLog("comparng " + str(pid) + " with " + str(id_), 5) self.restLog("comparng " + str(pid) + " with " + Constants.NIIRI + id_, 5) self.restLog("comparng " + str(pid) + " with niiri:" + id_, 5) - if pid == id_ or pid == Constants.NIIRI + id_ or pid == "niiri:" + id_: + if pid in (id_, Constants.NIIRI + id_, "niiri:" + id_): # strip off prefixes to make it more human readable for key in projects["projects"][pid]: short_key = key @@ -602,17 +599,17 @@ def addFieldStats(self, result, project, subjects, field, type): # noqa: A002 data = Query.GetParticipantInstrumentData( tuple(self.nidm_files), project, s ) - for i in data: - if field in data[i]: - values.append(float(data[i][field])) + for v in data.values(): + if field in v: + values.append(float(v[field])) # derivatives are of the form [UUID]['values'][URI]{datumType, label, values, units} if type == self.STAT_TYPE_DERIVATIVES: data = Query.GetDerivativesDataForSubject( tuple(self.nidm_files), project, s ) - for deriv in data: - for URI in data[deriv]["values"]: - measures = data[deriv]["values"][URI] + for deriv_value in data.values(): + for URI in deriv_value["values"]: + measures = deriv_value["values"][URI] if field == measures["label"] or field == self.getTailOfURI( URI ): @@ -639,9 +636,7 @@ def projectSummary(self): pid = parse.unquote(str(match.group(1))) self.restLog(f"Returning project {pid} summary", 2) - result = nidm.experiment.Navigate.GetProjectAttributes( - self.nidm_files, project_id=pid - ) + result = Navigate.GetProjectAttributes(self.nidm_files, project_id=pid) result["subjects"] = Query.GetParticipantUUIDsForProject( self.nidm_files, project_id=pid, filter=self.query["filter"] ) @@ -772,7 +767,7 @@ def subjects(self): # print ("getting info for " + str(s)) x = self.getFieldInfoForSubject(proj, s) - if x != {}: + if x: result["fields"][Query.URITail(s)] = x return self.subjectFormat(result) diff --git a/tests/experiment/test_experiment_basic.py b/tests/experiment/test_experiment_basic.py index 0c321d8..c568341 100644 --- a/tests/experiment/test_experiment_basic.py +++ b/tests/experiment/test_experiment_basic.py @@ -28,7 +28,7 @@ def test_2(tmp_path: Path) -> None: def test_sessions_1() -> None: project = Project() - assert project.sessions == [] + assert not project.sessions session1 = Session(project) project.add_sessions(session1) @@ -42,7 +42,7 @@ def test_sessions_1() -> None: def test_sessions_2() -> None: project = Project() - assert project.sessions == [] + assert not project.sessions session1 = Session(project) assert project.sessions[0].label == session1.label @@ -161,7 +161,7 @@ def test_session_noparameters(): assert issubclass(type(proj.bundle), prov.model.ProvDocument) # checking if one session is added - assert len(proj.sessions) + assert len(proj.sessions) != 0 # checking graph namespace const_l = list(Constants.namespaces) diff --git a/tests/experiment/test_map_vars_to_terms.py b/tests/experiment/test_map_vars_to_terms.py index bd1c921..3a1c5bd 100644 --- a/tests/experiment/test_map_vars_to_terms.py +++ b/tests/experiment/test_map_vars_to_terms.py @@ -159,9 +159,9 @@ def test_map_vars_to_terms_BIDS(setup: Setup, tmp_path: Path) -> None: # check whether JSON mapping structure returned from map_variables_to_terms matches the # reproshema structure - assert "DD(source='test', variable='age')" in column_to_terms.keys() - assert "DD(source='test', variable='sex')" in column_to_terms.keys() - assert "isAbout" in column_to_terms["DD(source='test', variable='age')"].keys() + assert "DD(source='test', variable='age')" in column_to_terms + assert "DD(source='test', variable='sex')" in column_to_terms + assert "isAbout" in column_to_terms["DD(source='test', variable='age')"] assert ( "http://uri.interlex.org/ilx_0100400" == column_to_terms["DD(source='test', variable='age')"]["isAbout"][0]["@id"] @@ -260,9 +260,9 @@ def test_map_vars_to_terms_reproschema(setup: Setup, tmp_path: Path) -> None: # check whether JSON mapping structure returned from map_variables_to_terms matches the # reproshema structure - assert "DD(source='test', variable='age')" in column_to_terms.keys() - assert "DD(source='test', variable='sex')" in column_to_terms.keys() - assert "isAbout" in column_to_terms["DD(source='test', variable='age')"].keys() + assert "DD(source='test', variable='age')" in column_to_terms + assert "DD(source='test', variable='sex')" in column_to_terms + assert "isAbout" in column_to_terms["DD(source='test', variable='age')"] assert ( "http://uri.interlex.org/ilx_0100400" == column_to_terms["DD(source='test', variable='age')"]["isAbout"][0]["@id"] @@ -309,9 +309,9 @@ def test_map_vars_to_terms_reproschema(setup: Setup, tmp_path: Path) -> None: with open(tmp_path / "nidm_annotations_annotations.json") as fp: json.load(fp) - assert "DD(source='test', variable='age')" in column_to_terms.keys() - assert "DD(source='test', variable='sex')" in column_to_terms.keys() - assert "isAbout" in column_to_terms["DD(source='test', variable='age')"].keys() + assert "DD(source='test', variable='age')" in column_to_terms + assert "DD(source='test', variable='sex')" in column_to_terms + assert "isAbout" in column_to_terms["DD(source='test', variable='age')"] assert ( "http://uri.interlex.org/ilx_0100400" == column_to_terms["DD(source='test', variable='age')"]["isAbout"][0]["@id"] diff --git a/tests/experiment/test_query.py b/tests/experiment/test_query.py index 4899f53..1a91bff 100644 --- a/tests/experiment/test_query.py +++ b/tests/experiment/test_query.py @@ -33,7 +33,7 @@ def abide(brain_vol_files) -> ProjectData: for p in projects: proj_info = nidm.experiment.Navigate.GetProjectAttributes(files, p) if ( - "dctypes:title" in proj_info.keys() + "dctypes:title" in proj_info and proj_info["dctypes:title"] == "ABIDE - CMU_a" ): cmu_test_project_uuid = p @@ -105,10 +105,10 @@ def test_GetParticipantIDs(tmp_path: Path) -> None: acq = Acquisition(uuid="_15793", session=session) acq2 = Acquisition(uuid="_15795", session=session) - person = acq.add_person(attributes=({Constants.NIDM_SUBJECTID: "9999"})) + person = acq.add_person(attributes={Constants.NIDM_SUBJECTID: "9999"}) acq.add_qualified_association(person=person, role=Constants.NIDM_PARTICIPANT) - person2 = acq2.add_person(attributes=({Constants.NIDM_SUBJECTID: "8888"})) + person2 = acq2.add_person(attributes={Constants.NIDM_SUBJECTID: "8888"}) acq2.add_qualified_association(person=person2, role=Constants.NIDM_PARTICIPANT) # save a turtle file @@ -287,7 +287,7 @@ def test_GetProjectsMetadata(abide: ProjectData, tmp_path: Path) -> None: # find the project ID from the CMU file p3 = None for project_id in parsed["projects"]: - if project_id != p1 and project_id != p2: + if project_id not in (p1, p2): if ( parsed["projects"][project_id][str(Constants.NIDM_PROJECT_NAME)] == "ABIDE - CMU_a" @@ -331,10 +331,10 @@ def test_GetProjectAttributes(abide: ProjectData) -> None: assert ( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" in project_attributes ) or ("type" in project_attributes) - assert ("AcquisitionModality") in project_attributes - assert ("ImageContrastType") in project_attributes - assert ("Task") in project_attributes - assert ("ImageUsageType") in project_attributes + assert "AcquisitionModality" in project_attributes + assert "ImageContrastType" in project_attributes + assert "Task" in project_attributes + assert "ImageUsageType" in project_attributes def test_download_cde_files(): diff --git a/tests/experiment/tools/test_rest.py b/tests/experiment/tools/test_rest.py index aae13fd..daad97f 100644 --- a/tests/experiment/tools/test_rest.py +++ b/tests/experiment/tools/test_rest.py @@ -101,14 +101,14 @@ def makeTestFile(dirpath: Path, filename: str, params: dict) -> RestTest: acq2 = Acquisition(uuid="_acq2", session=session) acq3 = Acquisition(uuid="_acq2", session=session) - person = acq.add_person(attributes=({Constants.NIDM_SUBJECTID: "a1_9999"})) + person = acq.add_person(attributes={Constants.NIDM_SUBJECTID: "a1_9999"}) test_person_uuid = (str(person.identifier)).replace("niiri:", "") acq.add_qualified_association(person=person, role=Constants.NIDM_PARTICIPANT) - person2 = acq2.add_person(attributes=({Constants.NIDM_SUBJECTID: "a1_8888"})) + person2 = acq2.add_person(attributes={Constants.NIDM_SUBJECTID: "a1_8888"}) acq2.add_qualified_association(person=person2, role=Constants.NIDM_PARTICIPANT) - person3 = acq3.add_person(attributes=({Constants.NIDM_SUBJECTID: "a2_7777"})) + person3 = acq3.add_person(attributes={Constants.NIDM_SUBJECTID: "a2_7777"}) acq2.add_qualified_association(person=person3, role=Constants.NIDM_PARTICIPANT) project2 = Project(uuid=project_uuid2, attributes=p2kwargs) @@ -116,9 +116,9 @@ def makeTestFile(dirpath: Path, filename: str, params: dict) -> RestTest: acq4 = Acquisition(uuid="_acq3", session=session2) acq5 = Acquisition(uuid="_acq4", session=session2) - person4 = acq4.add_person(attributes=({Constants.NIDM_SUBJECTID: "a3_6666"})) + person4 = acq4.add_person(attributes={Constants.NIDM_SUBJECTID: "a3_6666"}) acq4.add_qualified_association(person=person4, role=Constants.NIDM_PARTICIPANT) - person5 = acq5.add_person(attributes=({Constants.NIDM_SUBJECTID: "a4_5555"})) + person5 = acq5.add_person(attributes={Constants.NIDM_SUBJECTID: "a4_5555"}) acq5.add_qualified_association(person=person5, role=Constants.NIDM_PARTICIPANT) # now add some assessment instrument data @@ -349,10 +349,10 @@ def test_brain_vols(brain_vol: BrainVol) -> None: data = Query.GetDerivativesDataForSubject(brain_vol.files, None, subject) assert len(data) > 0 - for key in data: - assert "StatCollectionType" in data[key] - assert "URI" in data[key] - assert "values" in data[key] + for value in data.values(): + assert "StatCollectionType" in value + assert "URI" in value + assert "values" in value def test_GetParticipantDetails(brain_vol: BrainVol) -> None: @@ -388,10 +388,10 @@ def test_CheckSubjectMatchesFilter(brain_vol: BrainVol) -> None: derivatives = Query.GetDerivativesDataForSubject(brain_vol.files, project, subject) - for skey in derivatives: - for vkey in derivatives[skey]["values"]: + for svalue in derivatives.values(): + for vkey in svalue["values"]: dt = vkey - val = derivatives[skey]["values"][vkey]["value"] + val = svalue["values"][vkey]["value"] if dt and val: break @@ -402,7 +402,7 @@ def test_CheckSubjectMatchesFilter(brain_vol: BrainVol) -> None: ) instruments = Query.GetParticipantInstrumentData(brain_vol.files, project, subject) - for _, inst in instruments.items(): + for inst in instruments.values(): if "AGE_AT_SCAN" in inst: age = inst["AGE_AT_SCAN"] @@ -551,7 +551,6 @@ def assess_one_col_output(txt_output): # if we didn't find a line with a uuid then we simply flag a false assertion and return the first line of output # cause it doesn't really matter at this point the assertion already failed raise AssertionError - return lines[0] def is_uuid(uuid): @@ -621,7 +620,7 @@ def test_cli_rest_routes(brain_vol: BrainVol) -> None: summary_lines = ( sections[0].strip().splitlines()[1:-1] ) # first and last lines should be ----- - summary = dict() + summary = {} for ln in summary_lines: summary[ln.split()[0]] = ln.split()[1] inst_uuid = summary["instruments"].split(",")[0] @@ -665,7 +664,7 @@ def test_multiple_project_fields(brain_vol: BrainVol) -> None: print(fields) fv = fields assert type(fv) == list - fields_used = set([i.label for i in fv]) + fields_used = {i.label for i in fv} assert ("brain" in fields_used) or ( "Brain Segmentation Volume (mm^3)" in fields_used ) @@ -742,7 +741,7 @@ def test_project_fields_not_found(brain_vol: BrainVol) -> None: project = rest_parser.run( brain_vol.files, f"/projects/{brain_vol.cmu_test_project_uuid}?fields={field}" ) - keys = {i for i in project} + keys = set(project) assert "error" in keys