diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv index e8875c11ec..e94b9aa50a 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Field_Level.csv @@ -1,67 +1,19 @@ table_name,column_name,data_type,is_nullable,is_suppressed,rule -condition_occurrence,condition_source_value,STRING,YES,YES,DC-1370 -condition_occurrence,condition_status_source_value,STRING,YES,YES,DC-1370 -condition_occurrence,stop_reason,STRING,YES,YES,DC-1370 -death,cause_source_value,STRING,YES,YES,DC-1370 -device_exposure,device_source_value,STRING,YES,YES,DC-1370 -device_exposure,unique_device_id,STRING,YES,YES,DC-1370 -drug_exposure,dose_unit_source_value,STRING,YES,YES,DC-1370 -drug_exposure,drug_source_value,STRING,YES,YES,DC-1370 -drug_exposure,lot_number,STRING,YES,YES,DC-1370 -drug_exposure,route_source_value,STRING,YES,YES,DC-1370 -drug_exposure,sig,STRING,YES,YES,DC-1370 -drug_exposure,stop_reason,STRING,YES,YES,DC-1370 measurement,measurement_source_value,STRING,YES,YES,DC-1370 measurement,unit_source_value,STRING,YES,YES,DC-1370 measurement,value_source_value,STRING,YES,YES,DC-1370 -note,note_source_value,STRING,YES,YES,DC-1370 -note,note_text,STRING,NO,YES,DC-1370 -note,note_title,STRING,NO,YES,DC-1370 observation,observation_source_value,STRING,YES,YES,DC-1370 observation,qualifier_source_value,STRING,YES,YES,DC-1370 observation,unit_source_value,STRING,YES,YES,DC-1370 observation,value_as_string,STRING,YES,YES,DC-1370 observation,value_source_value,STRING,YES,YES,DC-1370 -person,ethnicity_source_value,STRING,YES,YES,DC-1370 -person,gender_source_value,STRING,YES,YES,DC-1370 -person,person_source_value,STRING,YES,YES,DC-1370 -person,race_source_value,STRING,YES,YES,DC-1370 -procedure_occurrence,procedure_source_value,STRING,YES,YES,DC-1370 -procedure_occurrence,modifier_source_value,STRING,YES,YES,DC-1370 -specimen,anatomic_site_source_value,STRING,YES,YES,DC-1370 -specimen,disease_status_source_value,STRING,YES,YES,DC-1370 -specimen,specimen_source_id,STRING,YES,YES,DC-1370 -specimen,specimen_source_value,STRING,YES,YES,DC-1370 -specimen,unit_source_value,STRING,YES,YES,DC-1370 visit_occurrence,admitting_source_value,STRING,YES,YES,DC-1370 visit_occurrence,discharge_to_source_value,STRING,YES,YES,DC-1370 visit_occurrence,visit_source_value,STRING,YES,YES,DC-1370 -person,month_of_birth,INT64,YES,YES,DC-1373 -person,day_of_birth,INT64,YES,YES,DC-1373 -person,birth_datetime,TIMESTAMP,YES,YES,DC-1373 -person,location_id,INT64,YES,YES,DC-1373 -person,provider_id,INT64,YES,YES,DC-1373 -person,care_site_id,INT64,YES,YES,DC-1373 observation,provider_id,INT64,YES,YES,DC-1373 visit_occurrence,provider_id,INT64,YES,YES,DC-1373 visit_occurrence,care_site_id,INT64,YES,YES,DC-1373 -care_site_ext,care_site_id,INT64,YES,YES,DC-1373 measurement,provider_id,INT64,YES,YES,DC-1373 -provider,provider_id,INT64,NO,YES,DC-1373 -provider,care_site_id,INT64,YES,YES,DC-1373 -procedure_occurrence,provider_id,INT64,YES,YES,DC-1373 -care_site,care_site_id,INT64,NO,YES,DC-1373 -care_site,location_id,INT64,YES,YES,DC-1373 -device_exposure,provider_id,INT64,YES,YES,DC-1373 -location_ext,location_id,INT64,YES,YES,DC-1373 -location,location_id,INT64,NO,YES,DC-1373 -condition_occurrence,provider_id,INT64,YES,YES,DC-1373 -drug_exposure,provider_id,INT64,YES,YES,DC-1373 -note,provider_id,INT64,YES,YES,DC-1373 -provider_ext,provider_id,INT64,YES,YES,DC-1373 -person,month_of_birth,INT64,YES,YES,DC-1357 -person,day_of_birth,INT64,YES,YES,DC-1357 -person,birth_datetime,TIMESTAMP,YES,YES,DC-1357 observation,observation_concept_id,INT64,NO,,DC-1368 observation,observation_type_concept_id,INT64,NO,,DC-1368 observation,value_as_concept_id,INT64,YES,,DC-1368 @@ -69,78 +21,17 @@ observation,qualifier_concept_id,INT64,YES,,DC-1368 observation,unit_concept_id,INT64,YES,,DC-1368 observation,observation_source_concept_id,INT64,YES,,DC-1368 observation,value_source_concept_id,INT64,YES,,DC-1368 -drug_era,drug_concept_id,INT64,NO,,DC-1368 -fact_relationship,relationship_concept_id,INT64,NO,,DC-1368 -observation_period,period_type_concept_id,INT64,NO,,DC-1368 -procedure_cost,currency_concept_id,INT64,YES,,DC-1368 -procedure_cost,revenue_code_concept_id,INT64,YES,,DC-1368 visit_occurrence,visit_concept_id,INT64,NO,,DC-1368 visit_occurrence,visit_type_concept_id,INT64,NO,,DC-1368 visit_occurrence,visit_source_concept_id,INT64,YES,,DC-1368 visit_occurrence,admitting_source_concept_id,INT64,YES,,DC-1368 visit_occurrence,discharge_to_concept_id,INT64,YES,,DC-1368 -drug_strength,drug_concept_id,INT64,NO,,DC-1368 -drug_strength,ingredient_concept_id,INT64,NO,,DC-1368 -drug_strength,amount_unit_concept_id,INT64,YES,,DC-1368 -drug_strength,numerator_unit_concept_id,INT64,YES,,DC-1368 -drug_strength,denominator_unit_concept_id,INT64,YES,,DC-1368 -condition_era,condition_concept_id,INT64,NO,,DC-1368 measurement,measurement_concept_id,INT64,NO,,DC-1368 measurement,measurement_type_concept_id,INT64,NO,,DC-1368 measurement,operator_concept_id,INT64,YES,,DC-1368 measurement,value_as_concept_id,INT64,YES,,DC-1368 measurement,unit_concept_id,INT64,YES,,DC-1368 measurement,measurement_source_concept_id,INT64,YES,,DC-1368 -visit_cost,currency_concept_id,INT64,YES,,DC-1368 -provider,specialty_concept_id,INT64,YES,,DC-1368 -provider,gender_concept_id,INT64,YES,,DC-1368 -provider,specialty_source_concept_id,INT64,YES,,DC-1368 -provider,gender_source_concept_id,INT64,YES,,DC-1368 -person,gender_concept_id,INT64,NO,,DC-1368 -person,race_concept_id,INT64,NO,,DC-1368 -person,ethnicity_concept_id,INT64,NO,,DC-1368 -person,gender_source_concept_id,INT64,YES,,DC-1368 -person,race_source_concept_id,INT64,YES,,DC-1368 -person,ethnicity_source_concept_id,INT64,YES,,DC-1368 -drug_cost,currency_concept_id,INT64,YES,,DC-1368 -cohort_attribute,value_as_concept_id,INT64,YES,,DC-1368 -procedure_occurrence,procedure_concept_id,INT64,NO,,DC-1368 -procedure_occurrence,procedure_type_concept_id,INT64,NO,,DC-1368 -procedure_occurrence,modifier_concept_id,INT64,YES,,DC-1368 -procedure_occurrence,procedure_source_concept_id,INT64,YES,,DC-1368 -care_site,place_of_service_concept_id,INT64,YES,,DC-1368 -specimen,specimen_concept_id,INT64,NO,,DC-1368 -specimen,specimen_type_concept_id,INT64,NO,,DC-1368 -specimen,unit_concept_id,INT64,YES,,DC-1368 -specimen,anatomic_site_concept_id,INT64,YES,,DC-1368 -specimen,disease_status_concept_id,INT64,YES,,DC-1368 -death,death_type_concept_id,INT64,NO,,DC-1368 -death,cause_concept_id,INT64,YES,,DC-1368 -death,cause_source_concept_id,INT64,YES,,DC-1368 -device_exposure,device_concept_id,INT64,NO,,DC-1368 -device_exposure,device_type_concept_id,INT64,NO,,DC-1368 -device_exposure,device_source_concept_id,INT64,YES,,DC-1368 -device_cost,currency_concept_id,INT64,YES,,DC-1368 -condition_occurrence,condition_concept_id,INT64,NO,,DC-1368 -condition_occurrence,condition_type_concept_id,INT64,NO,,DC-1368 -condition_occurrence,condition_source_concept_id,INT64,YES,,DC-1368 -condition_occurrence,condition_status_concept_id,INT64,YES,,DC-1368 -cohort_definition,definition_type_concept_id,INT64,NO,,DC-1368 -cohort_definition,subject_concept_id,INT64,NO,,DC-1368 -attribute_definition,attribute_type_concept_id,INT64,NO,,DC-1368 -drug_exposure,drug_concept_id,INT64,NO,,DC-1368 -drug_exposure,drug_type_concept_id,INT64,NO,,DC-1368 -drug_exposure,route_concept_id,INT64,YES,,DC-1368 -drug_exposure,drug_source_concept_id,INT64,YES,,DC-1368 -note,note_type_concept_id,INT64,NO,,DC-1368 -note,note_class_concept_id,INT64,NO,,DC-1368 -note,encoding_concept_id,INT64,NO,,DC-1368 -note,language_concept_id,INT64,NO,,DC-1368 -cost,cost_type_concept_id,INT64,NO,,DC-1368 -cost,currency_concept_id,INT64,YES,,DC-1368 -cost,revenue_code_concept_id,INT64,YES,,DC-1368 -dose_era,drug_concept_id,INT64,NO,,DC-1368 -dose_era,unit_concept_id,INT64,NO,,DC-1368 observation,observation_concept_id,INT64,NO,,DC-1388 observation,observation_type_concept_id,INT64,NO,,DC-1388 observation,value_as_concept_id,INT64,YES,,DC-1388 diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Mapping.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Mapping.csv index cb1992df96..998ab968ea 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Mapping.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Mapping.csv @@ -1,29 +1 @@ table_name,column_name,data_type,primary_key,mapping_table,new_id,rule -observation,person_id,INT64,observation_id,_deid_map,research_id,DC-1346 -drug_era,person_id,INT64,drug_era_id,_deid_map,research_id,DC-1346 -observation_period,person_id,INT64,observation_period_id,_deid_map,research_id,DC-1346 -visit_occurrence,person_id,INT64,visit_occurrence_id,_deid_map,research_id,DC-1346 -measurement,person_id,INT64,measurement_id,_deid_map,research_id,DC-1346 -person,person_id,INT64,person_id,_deid_map,research_id,DC-1346 -procedure_occurrence,person_id,INT64,procedure_occurrence_id,_deid_map,research_id,DC-1346 -specimen,person_id,INT64,specimen_id,_deid_map,research_id,DC-1346 -device_exposure,person_id,INT64,device_exposure_id,_deid_map,research_id,DC-1346 -payer_plan_period,person_id,INT64,payer_plan_period_id,_deid_map,research_id,DC-1346 -condition_occurrence,person_id,INT64,condition_occurrence_id,_deid_map,research_id,DC-1346 -drug_exposure,person_id,INT64,drug_exposure_id,_deid_map,research_id,DC-1346 -note,person_id,INT64,note_id,_deid_map,research_id,DC-1346 -dose_era,person_id,INT64,dose_era_id,_deid_map,research_id,DC-1346 -survey_conduct,person_id,INT64,survey_conduct_id,_deid_map,research_id,DC-1346 -observation,questionnaire_response_id,INT64,observation_id,_deid_questionnaire_response_map,research_response_id,DC-1348 -survey_conduct,survey_conduct_id,INT64,survey_conduct_id,_deid_questionnaire_response_map,research_response_id,DC-1348 -survey_conduct,survey_source_identifier,STRING,survey_conduct_id,_deid_questionnaire_response_map,research_response_id,DC-1348 -observation,value_as_string,STRING,observation_id,_zip_map,,DC-1377 -care_site_ext,src_id,STRING,care_site_id,site_maskings,src_id,DC-1355 -visit_occurrence_ext,src_id,STRING,visit_occurrence_id,site_maskings,src_id,DC-1355 -procedure_occurrence_ext,src_id,STRING,procedure_occurrence_id,site_maskings,src_id,DC-1355 -drug_exposure_ext,src_id,STRING,drug_exposure_id,site_maskings,src_id,DC-1355 -provider_ext,src_id,STRING,provider_id,site_maskings,src_id,DC-1355 -device_exposure_ext,src_id,STRING,device_exposure_id,site_maskings,src_id,DC-1355 -condition_occurrence_ext,src_id,STRING,condition_occurrence_id,site_maskings,src_id,DC-1355 -observation_ext,src_id,STRING,observation_id,site_maskings,src_id,DC-1355 -measurement_ext,src_id,STRING,measurement_id,site_maskings,src_id,DC-1355 diff --git a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Table_Level.csv b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Table_Level.csv index 407520c3d3..ed2688bfa3 100644 --- a/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Table_Level.csv +++ b/data_steward/analytics/cdr_ops/controlled_tier_qc/csv/Controlled_Tier_Table_Level.csv @@ -1,5 +1,2 @@ table_name,rule -note,DC-1362 -location,DC-1362 -care_site,DC-1362 -provider,DC-1362 + diff --git a/data_steward/cdr_cleaner/clean_cdr.py b/data_steward/cdr_cleaner/clean_cdr.py index 3e0c6c9c92..b1ef6aabb7 100644 --- a/data_steward/cdr_cleaner/clean_cdr.py +++ b/data_steward/cdr_cleaner/clean_cdr.py @@ -19,8 +19,6 @@ from cdr_cleaner.cleaning_rules.convert_pre_post_coordinated_concepts import ConvertPrePostCoordinatedConcepts from cdr_cleaner.cleaning_rules.create_aian_lookup import CreateAIANLookup from cdr_cleaner.cleaning_rules.create_expected_ct_list import StoreExpectedCTList -from cdr_cleaner.cleaning_rules.deid.ct_additional_privacy_suppression import CTAdditionalPrivacyConceptSuppression -from cdr_cleaner.cleaning_rules.deid.ct_observation_privacy_suppression import CTObservationPrivacySuppression from cdr_cleaner.cleaning_rules.deid.ct_nph_observation_privacy_suppression import CTNPHObservationPrivacySuppression from cdr_cleaner.cleaning_rules.deid.rt_additional_privacy_suppression import RTAdditionalPrivacyConceptSuppression from cdr_cleaner.cleaning_rules.deid.rt_observation_privacy_suppression import RTObservationPrivacySuppression @@ -354,42 +352,48 @@ (FitbitDateShiftRule,), ] -CONTROLLED_TIER_DEID_CLEANING_CLASSES = [ - (RtCtPIDtoRID,), - (QRIDtoRID,), # Should run before any row suppression rules - (TruncateEraTables,), - (NullPersonBirthdate,), - (TableSuppression,), +NPH_CONTROLLED_TIER_DEID_CLEANING_CLASSES = [ + (CleanPPINumericFieldsUsingParameters,), (ControlledTierReplacedConceptSuppression,), (GeneralizeZipCodes,), # Should run after any data remapping rules - # (RaceEthnicityRecordSuppression,), # Should run after any data remapping rules + (RaceEthnicityRecordSuppression,), # Should run after any data remapping rules ( MotorVehicleAccidentSuppression,), (VehicularAccidentConceptSuppression,), (ExplicitIdentifierSuppression,), (GeoLocationConceptSuppression,), (BirthInformationSuppression,), - (YearOfBirthRecordsSuppression,), - (ControlledCopeSurveySuppression,), (IDFieldSuppression,), # Should run after any data remapping (CancerConceptSuppression,), # Should run after any data remapping rules (SectionParticipationConceptSuppression,), - (CTAdditionalPrivacyConceptSuppression,), - (CTObservationPrivacySuppression,), - # (CTNPHObservationPrivacySuppression,), # Applies only to NPH data + (CTNPHObservationPrivacySuppression,), # Applies only to NPH data. will be dealt with in 2.0 when handling row suppressions ( StringFieldsSuppression,), (AggregateZipCodes,), (DeidentifyAIANZip3Values,), (FreeTextSurveyResponseSuppression,), - (DropOrphanedSurveyConductIds,), - (DropOrphanedPIDS,), - (GenerateWearStudyTable,), - (DropViaSurveyConduct,), # should run after wear study table creation - (RemoveExtraTables,), # Should be last cleaning rule to be run - (CalculatePrimaryDeathRecord,), - (CleanMappingExtTables,), # should be one of the last cleaning rules run -] + (FillSourceValueTextFields,), + ] + +CONTROLLED_TIER_DEID_CLEANING_CLASSES = [ + (RtCtPIDtoRID,), + (ControlledTierReplacedConceptSuppression,), + (GeneralizeZipCodes,), # Should run after any data remapping rules + (RaceEthnicityRecordSuppression,), # Should run after any data remapping rules + (MotorVehicleAccidentSuppression,), + (VehicularAccidentConceptSuppression,), + (ExplicitIdentifierSuppression,), + (GeoLocationConceptSuppression,), + (BirthInformationSuppression,), + (YearOfBirthRecordsSuppression,), + (IDFieldSuppression,), # Should run after any data remapping + (CancerConceptSuppression,), # Should run after any data remapping rules + (SectionParticipationConceptSuppression,), + (StringFieldsSuppression,), + (AggregateZipCodes,), + (DeidentifyAIANZip3Values,), + (FreeTextSurveyResponseSuppression,) + ] CONTROLLED_TIER_DEID_BASE_CLEANING_CLASSES = [ (FillSourceValueTextFields,), @@ -451,6 +455,8 @@ REGISTERED_TIER_FITBIT_CLEANING_CLASSES, DataStage.CONTROLLED_TIER_DEID.value: CONTROLLED_TIER_DEID_CLEANING_CLASSES, + DataStage.NPH_CONTROLLED_TIER_DEID.value: + NPH_CONTROLLED_TIER_DEID_CLEANING_CLASSES, DataStage.CONTROLLED_TIER_DEID_BASE.value: CONTROLLED_TIER_DEID_BASE_CLEANING_CLASSES, DataStage.CONTROLLED_TIER_DEID_CLEAN.value: diff --git a/data_steward/cdr_cleaner/cleaning_rules/clean_ppi_numeric_fields_using_parameters.py b/data_steward/cdr_cleaner/cleaning_rules/clean_ppi_numeric_fields_using_parameters.py index 84f537784a..056b756609 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/clean_ppi_numeric_fields_using_parameters.py +++ b/data_steward/cdr_cleaner/cleaning_rules/clean_ppi_numeric_fields_using_parameters.py @@ -128,7 +128,9 @@ END AS value_source_concept_id, value_source_value, - questionnaire_response_id + -- questionnaire_response_id -- + -- an NPH specific changes that needs to be addressed before 1.0 executions are run again. -- + form_name FROM {{project}}.{{dataset}}.observation""") @@ -159,7 +161,7 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id): 'DC1058', 'DC1061', 'DC827', 'DC502', 'DC487', 'DC2475', 'DC2649' ], description=desc, - affected_datasets=[cdr_consts.RDR], + affected_datasets=[cdr_consts.RDR, cdr_consts.NPH_CONTROLLED_TIER_DEID], affected_tables=['observation'], project_id=project_id, dataset_id=dataset_id, diff --git a/data_steward/cdr_cleaner/cleaning_rules/deid/string_fields_suppression.py b/data_steward/cdr_cleaner/cleaning_rules/deid/string_fields_suppression.py index 617cc035e2..fa723b0f6f 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/deid/string_fields_suppression.py +++ b/data_steward/cdr_cleaner/cleaning_rules/deid/string_fields_suppression.py @@ -31,7 +31,7 @@ FROM `{{project}}.{{dataset}}.{{domain_table}}` AS d LEFT JOIN `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` AS s ON d.{{domain_table}}_id = s.{{domain_table}}_id -WHERE d.{{field_name}} = {{field_value}} and s.{{domain_table}}_id IS NULL +WHERE d.{{field_name}} in ({{field_value}}) and s.{{domain_table}}_id IS NULL """) STRING_FIELD_SUPPRESSION_QUERY_TEMPLATE = JINJA_ENV.from_string(""" @@ -56,7 +56,7 @@ {% endfor %} FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` AS s WHERE d.{{domain_table}}_id = s.{{domain_table}}_id - AND s.{{field_name}} = {{field_value}} + AND s.{{field_name}} in ({{field_value}}) """) VALIDATION_QUERY_TEMPLATE = JINJA_ENV.from_string(""" @@ -107,7 +107,8 @@ def get_string_fields(domain_table): else: fields = [ field for field in resources.fields_for(domain_table) - if field['type'] == 'string' + if field['type'] == 'string' and + field['name'] not in ['form_name'] # NPH release includes form_name ] return fields @@ -210,7 +211,16 @@ def get_rule_exceptions(self) -> List[SuppressionException]: sandbox_table=self.sandbox_table_for(OBSERVATION), field_name=OBSERVATION_SOURCE_CONCEPT_ID, field_value=APPROXIMATE_DATE_OF_SYMPTOMS, - restore_fields=[VALUE_AS_STRING]) + restore_fields=[VALUE_AS_STRING]), + SuppressionException( + domain_table=OBSERVATION, + sandbox_table=self.sandbox_table_for(OBSERVATION), + field_name=OBSERVATION_SOURCE_CONCEPT_ID, + field_value=(f"SELECT concept_id " + f"FROM `{self.project_id}.{self.dataset_id}.concept` " + f"WHERE concept_code in ('cgm_dev_given_id', 'polar_id', 'poloar_actigraph_pair_id', 'device_notcollected', 'scr_studyid', 'profile_zipcode', 'geo_curr_hmzipcode', 'geo_curr_hmcountry')"), + restore_fields=[VALUE_AS_STRING] + ) ] def get_query_specs(self, *args, **keyword_args) -> query_spec_list: diff --git a/data_steward/cdr_cleaner/cleaning_rules/fill_source_value_text_fields.py b/data_steward/cdr_cleaner/cleaning_rules/fill_source_value_text_fields.py index e8f59d67b4..e62ea9d24f 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/fill_source_value_text_fields.py +++ b/data_steward/cdr_cleaner/cleaning_rules/fill_source_value_text_fields.py @@ -74,7 +74,10 @@ def get_affected_tables(): - return resources.CDM_TABLES + [AOU_DEATH] + # return resources.CDM_TABLES + [AOU_DEATH] + # this needs to be fixed for the 1.0 CDR execution. This only impacts + # observation for the NPH data + return [OBSERVATION] def get_fields_dict(table_name, fields): @@ -138,7 +141,7 @@ def get_fields_dict(table_name, fields): return fields_to_replace -def get_modified_columns(fields, fields_to_replace, table=None): +def get_modified_columns(fields, fields_to_replace, table=None, project_id=None, dataset_id=None): """ This method updates the columns by adding prefix to each column if the column is being replaced and @@ -154,14 +157,24 @@ def get_modified_columns(fields, fields_to_replace, table=None): if field in fields_to_replace: if table == OBSERVATION and field == 'value_as_string': col_expr = """ - IF(observation_source_concept_id = 1585250 AND REGEXP_CONTAINS(value_as_string, r'\*\*$'), + IF((observation_source_concept_id = 1585250 AND REGEXP_CONTAINS(value_as_string, r'\*\*$')) + OR (observation_source_concept_id in ( + SELECT concept_id + FROM `{project_id}.{dataset_id}.concept` + WHERE lower(concept_code) IN ('cgm_dev_given_id', 'polar_id', 'poloar_actigraph_pair_id', 'device_notcollected', 'scr_studyid', 'profile_zipcode', 'geo_curr_hmzipcode', 'geo_curr_hmcountry'))), {name}, {prefix}.concept_code) as {name} """.format(prefix=fields_to_replace[field]['prefix'], - name=fields_to_replace[field]['name']) + name=fields_to_replace[field]['name'], + project_id=project_id, + dataset_id=dataset_id) else: col_expr = '{prefix}.concept_code as {name}'.format( prefix=fields_to_replace[field]['prefix'], name=fields_to_replace[field]['name']) + elif field == "questionnaire_response_id": + # this needs to be programmatically removed as it will ignore this field in the CDR + # if another 1.0 execution is run + continue else: col_expr = field col_exprs.append(col_expr) @@ -209,7 +222,8 @@ def __init__(self, description=desc, affected_datasets=[ cdr_consts.REGISTERED_TIER_DEID_BASE, - cdr_consts.CONTROLLED_TIER_DEID_BASE + cdr_consts.CONTROLLED_TIER_DEID_BASE, + cdr_consts.NPH_CONTROLLED_TIER_DEID ], affected_tables=get_affected_tables(), depends_on=[StringFieldsSuppression], @@ -233,7 +247,9 @@ def get_query_specs(self, *args, **keyword_args) -> query_spec_list: if fields_to_replace: cols = get_modified_columns(fields, fields_to_replace, - table=table) + table=table, + project_id=self.project_id, + dataset_id=self.dataset_id) full_join_expression = get_full_join_expression( self.dataset_id, self.project_id, fields_to_replace) diff --git a/data_steward/cdr_cleaner/cleaning_rules/identifying_field_suppression.py b/data_steward/cdr_cleaner/cleaning_rules/identifying_field_suppression.py index 0333724343..0026126c3f 100644 --- a/data_steward/cdr_cleaner/cleaning_rules/identifying_field_suppression.py +++ b/data_steward/cdr_cleaner/cleaning_rules/identifying_field_suppression.py @@ -32,7 +32,7 @@ # Project imports from constants import bq_utils as bq_consts from constants.cdr_cleaner import clean_cdr as cdr_consts -from common import JINJA_ENV, CDM_TABLES +from common import JINJA_ENV, NPH_TABLES from utils import pipeline_logging from cdr_cleaner.cleaning_rules.base_cleaning_rule import BaseCleaningRule, query_spec_list from cdr_cleaner.cleaning_rules.table_suppression import TableSuppression @@ -85,7 +85,7 @@ def __init__(self, issue_numbers=JIRA_ISSUE_NUMBERS, description=desc, affected_datasets=[cdr_consts.COMBINED], - affected_tables=CDM_TABLES, + affected_tables=NPH_TABLES, project_id=project_id, dataset_id=dataset_id, sandbox_dataset_id=sandbox_dataset_id, diff --git a/data_steward/common.py b/data_steward/common.py index f608b87479..a0147d1fa2 100644 --- a/data_steward/common.py +++ b/data_steward/common.py @@ -97,7 +97,10 @@ ] + OTHER_CLINICAL_TABLES CDM_TABLES = AOU_REQUIRED + OTHER_CDM_TABLES - +NPH_TABLES = [MEASUREMENT, OBSERVATION, VISIT_OCCURRENCE, VISIT_DETAIL] +NPH_VOCABULARY_TABLES = ["concept", "concept_ancestor", "concept_relationship", "vocabulary", + # "concept_class", "concept_synonym", "domain", "drug_strength", "relationship" # Release 2 + ] # AoU custom tables AOU_DEATH = 'aou_death' AOU_CUSTOM_TABLES = [AOU_DEATH] diff --git a/data_steward/constants/cdr_cleaner/clean_cdr.py b/data_steward/constants/cdr_cleaner/clean_cdr.py index 95e6f25a07..7205429a72 100644 --- a/data_steward/constants/cdr_cleaner/clean_cdr.py +++ b/data_steward/constants/cdr_cleaner/clean_cdr.py @@ -51,6 +51,7 @@ FUNCTION_NAME_DEFAULT_VALUE = 'Unknown function' LINE_NO_DEFAULT_VALUE = 'Unknown line number' +NPH_CONTROLLED_TIER_DEID = 'nph_controlled_tier_deid' @unique class DataStage(Enum): @@ -68,6 +69,7 @@ class DataStage(Enum): CONTROLLED_TIER_DEID_BASE = CONTROLLED_TIER_DEID_BASE CONTROLLED_TIER_DEID_CLEAN = CONTROLLED_TIER_DEID_CLEAN CONTROLLED_TIER_FITBIT = CONTROLLED_TIER_FITBIT + NPH_CONTROLLED_TIER_DEID = NPH_CONTROLLED_TIER_DEID DATA_CONSISTENCY = DATA_CONSISTENCY CRON_RETRACTION = CRON_RETRACTION diff --git a/data_steward/gcloud/bq/__init__.py b/data_steward/gcloud/bq/__init__.py index f8d56a6b76..5621603eaa 100644 --- a/data_steward/gcloud/bq/__init__.py +++ b/data_steward/gcloud/bq/__init__.py @@ -17,22 +17,16 @@ from google.auth import default from google.api_core.exceptions import GoogleAPIError, BadRequest, Conflict from google.cloud.exceptions import NotFound -from opentelemetry import trace -from opentelemetry.exporter.cloud_trace import CloudTraceSpanExporter -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor # Project imports from utils import auth from resources import fields_for, get_and_validate_schema_fields, replace_special_characters_for_labels, \ is_rdr_dataset, is_mapping_table from constants.utils import bq as consts -from common import JINJA_ENV, IDENTITY_MATCH, PARTICIPANT_MATCH, PIPELINE_TABLES, SITE_MASKING_TABLE_ID +from common import JINJA_ENV, IDENTITY_MATCH, PARTICIPANT_MATCH, PIPELINE_TABLES, SITE_MASKING_TABLE_ID, NPH_TABLES, \ + NPH_VOCABULARY_TABLES from resources import get_bq_col_type -tracer_provider = TracerProvider() -trace.set_tracer_provider(tracer_provider) -tracer = trace.get_tracer(__name__) BIGQUERY_DATA_TYPES = { 'integer': 'INT64', @@ -62,16 +56,11 @@ def __init__(self, project_id: str, scopes=None, credentials=None): :return: A BigQueryClient instance """ - cloud_trace_exporter = CloudTraceSpanExporter(project_id=project_id) - tracer_provider.add_span_processor( - BatchSpanProcessor(cloud_trace_exporter)) - # TODO create counter to keep track of multiple client instances - with tracer.start_as_current_span(project_id): - if scopes: - credentials, project_id = default() - credentials = auth.delegated_credentials(credentials, - scopes=scopes) - super().__init__(project=project_id, credentials=credentials) + if scopes: + credentials, project_id = default() + credentials = auth.delegated_credentials(credentials, + scopes=scopes) + super().__init__(project=project_id, credentials=credentials) def get_table_schema(self, table_name: str, fields=None) -> list: """ @@ -214,7 +203,7 @@ def define_dataset(self, dataset_id: str, description: str, dataset = bigquery.Dataset(dataset_id) dataset.description = description dataset.labels = label_or_tag - dataset.location = "US" + dataset.location = "us-central1" return dataset @@ -467,6 +456,10 @@ def build_and_copy_contents(self, src_dataset: str, dest_dataset: str): """ table_list = self.list_tables(src_dataset) + tables_to_copy = NPH_VOCABULARY_TABLES + NPH_TABLES + + table_list = [table_item for table_item in table_list if table_item.table_id in tables_to_copy] + for table_item in table_list: # create empty schemaed table with client object try: @@ -483,7 +476,11 @@ def build_and_copy_contents(self, src_dataset: str, dest_dataset: str): if schema_list: sc_list = [] for item in schema_list: - field_cast = f'CAST({item.name} AS {BIGQUERY_DATA_TYPES[item.field_type.lower()]}) AS {item.name}' + # NPH does not provide AOU-specific fields, skip them + if item.name in ['questionnaire_response_id']: + continue + else: + field_cast = f'CAST({item.name} AS {BIGQUERY_DATA_TYPES[item.field_type.lower()]}) AS {item.name}' sc_list.append(field_cast) fields_name_str = ',\n'.join(sc_list) diff --git a/data_steward/resource_files/schemas/cdm/clinical/measurement.json b/data_steward/resource_files/schemas/cdm/clinical/measurement.json index e352df0eda..6e543b1df9 100644 --- a/data_steward/resource_files/schemas/cdm/clinical/measurement.json +++ b/data_steward/resource_files/schemas/cdm/clinical/measurement.json @@ -117,5 +117,11 @@ "name": "value_source_value", "mode": "nullable", "description": "The source value associated with the content of the value_as_number or value_as_concept_id as stored in the source data." + }, + { + "type": "string", + "name": "form_name", + "mode": "nullable", + "description": "" } ] \ No newline at end of file diff --git a/data_steward/resource_files/schemas/cdm/clinical/observation.json b/data_steward/resource_files/schemas/cdm/clinical/observation.json index 1046c83777..41e590660c 100644 --- a/data_steward/resource_files/schemas/cdm/clinical/observation.json +++ b/data_steward/resource_files/schemas/cdm/clinical/observation.json @@ -124,5 +124,11 @@ "name": "questionnaire_response_id", "mode": "nullable", "description": "An ID for a questionnaire response that produced this observation. This is applicable to AllOfUs questionnaire answers only. All answers with the same questionnaire response ID were submitted in the same response." + }, + { + "type": "string", + "name": "form_name", + "mode": "nullable", + "description": "" } ] \ No newline at end of file diff --git a/data_steward/tools/create_tier.py b/data_steward/tools/create_tier.py index 2bd1c257bb..99c0eb2fa1 100644 --- a/data_steward/tools/create_tier.py +++ b/data_steward/tools/create_tier.py @@ -19,7 +19,7 @@ LOGGER = logging.getLogger(__name__) -TIER_LIST = ['controlled', 'registered'] +TIER_LIST = ['nph_controlled', 'controlled', 'registered'] DEID_STAGE_LIST = ['deid', 'deid_base', 'deid_clean', 'fitbit_deid'] @@ -79,7 +79,7 @@ def get_dataset_name(tier, release_tag, deid_stage): # validate parameters validate_create_tier_args(tier, deid_stage, release_tag) - tier = tier[0].upper() + tier = 'C' if 'controlled' in tier else 'R' dataset_name = f"{tier}{release_tag}_{deid_stage}" @@ -224,7 +224,7 @@ def create_tier(credentials_filepath, project_id, tier, input_dataset, qa_handoff_date, '--etl_version', versions[0] ]) - if tier == 'controlled': + if 'controlled' in tier: bq_client.copy_table( f'{project_id}.{PIPELINE_TABLES}.{ZIP3_SES_MAP}', f'{project_id}.{datasets[consts.STAGING]}.{ZIP3_SES_MAP}')