Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nph (do not merge) #1865

Draft
wants to merge 16 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,146 +1,37 @@
table_name,column_name,data_type,is_nullable,is_suppressed,rule
condition_occurrence,condition_source_value,STRING,YES,YES,DC-1370
condition_occurrence,condition_status_source_value,STRING,YES,YES,DC-1370
condition_occurrence,stop_reason,STRING,YES,YES,DC-1370
death,cause_source_value,STRING,YES,YES,DC-1370
device_exposure,device_source_value,STRING,YES,YES,DC-1370
device_exposure,unique_device_id,STRING,YES,YES,DC-1370
drug_exposure,dose_unit_source_value,STRING,YES,YES,DC-1370
drug_exposure,drug_source_value,STRING,YES,YES,DC-1370
drug_exposure,lot_number,STRING,YES,YES,DC-1370
drug_exposure,route_source_value,STRING,YES,YES,DC-1370
drug_exposure,sig,STRING,YES,YES,DC-1370
drug_exposure,stop_reason,STRING,YES,YES,DC-1370
measurement,measurement_source_value,STRING,YES,YES,DC-1370
measurement,unit_source_value,STRING,YES,YES,DC-1370
measurement,value_source_value,STRING,YES,YES,DC-1370
note,note_source_value,STRING,YES,YES,DC-1370
note,note_text,STRING,NO,YES,DC-1370
note,note_title,STRING,NO,YES,DC-1370
observation,observation_source_value,STRING,YES,YES,DC-1370
observation,qualifier_source_value,STRING,YES,YES,DC-1370
observation,unit_source_value,STRING,YES,YES,DC-1370
observation,value_as_string,STRING,YES,YES,DC-1370
observation,value_source_value,STRING,YES,YES,DC-1370
person,ethnicity_source_value,STRING,YES,YES,DC-1370
person,gender_source_value,STRING,YES,YES,DC-1370
person,person_source_value,STRING,YES,YES,DC-1370
person,race_source_value,STRING,YES,YES,DC-1370
procedure_occurrence,procedure_source_value,STRING,YES,YES,DC-1370
procedure_occurrence,modifier_source_value,STRING,YES,YES,DC-1370
specimen,anatomic_site_source_value,STRING,YES,YES,DC-1370
specimen,disease_status_source_value,STRING,YES,YES,DC-1370
specimen,specimen_source_id,STRING,YES,YES,DC-1370
specimen,specimen_source_value,STRING,YES,YES,DC-1370
specimen,unit_source_value,STRING,YES,YES,DC-1370
visit_occurrence,admitting_source_value,STRING,YES,YES,DC-1370
visit_occurrence,discharge_to_source_value,STRING,YES,YES,DC-1370
visit_occurrence,visit_source_value,STRING,YES,YES,DC-1370
person,month_of_birth,INT64,YES,YES,DC-1373
person,day_of_birth,INT64,YES,YES,DC-1373
person,birth_datetime,TIMESTAMP,YES,YES,DC-1373
person,location_id,INT64,YES,YES,DC-1373
person,provider_id,INT64,YES,YES,DC-1373
person,care_site_id,INT64,YES,YES,DC-1373
observation,provider_id,INT64,YES,YES,DC-1373
visit_occurrence,provider_id,INT64,YES,YES,DC-1373
visit_occurrence,care_site_id,INT64,YES,YES,DC-1373
care_site_ext,care_site_id,INT64,YES,YES,DC-1373
measurement,provider_id,INT64,YES,YES,DC-1373
provider,provider_id,INT64,NO,YES,DC-1373
provider,care_site_id,INT64,YES,YES,DC-1373
procedure_occurrence,provider_id,INT64,YES,YES,DC-1373
care_site,care_site_id,INT64,NO,YES,DC-1373
care_site,location_id,INT64,YES,YES,DC-1373
device_exposure,provider_id,INT64,YES,YES,DC-1373
location_ext,location_id,INT64,YES,YES,DC-1373
location,location_id,INT64,NO,YES,DC-1373
condition_occurrence,provider_id,INT64,YES,YES,DC-1373
drug_exposure,provider_id,INT64,YES,YES,DC-1373
note,provider_id,INT64,YES,YES,DC-1373
provider_ext,provider_id,INT64,YES,YES,DC-1373
person,month_of_birth,INT64,YES,YES,DC-1357
person,day_of_birth,INT64,YES,YES,DC-1357
person,birth_datetime,TIMESTAMP,YES,YES,DC-1357
observation,observation_concept_id,INT64,NO,,DC-1368
observation,observation_type_concept_id,INT64,NO,,DC-1368
observation,value_as_concept_id,INT64,YES,,DC-1368
observation,qualifier_concept_id,INT64,YES,,DC-1368
observation,unit_concept_id,INT64,YES,,DC-1368
observation,observation_source_concept_id,INT64,YES,,DC-1368
observation,value_source_concept_id,INT64,YES,,DC-1368
drug_era,drug_concept_id,INT64,NO,,DC-1368
fact_relationship,relationship_concept_id,INT64,NO,,DC-1368
observation_period,period_type_concept_id,INT64,NO,,DC-1368
procedure_cost,currency_concept_id,INT64,YES,,DC-1368
procedure_cost,revenue_code_concept_id,INT64,YES,,DC-1368
visit_occurrence,visit_concept_id,INT64,NO,,DC-1368
visit_occurrence,visit_type_concept_id,INT64,NO,,DC-1368
visit_occurrence,visit_source_concept_id,INT64,YES,,DC-1368
visit_occurrence,admitting_source_concept_id,INT64,YES,,DC-1368
visit_occurrence,discharge_to_concept_id,INT64,YES,,DC-1368
drug_strength,drug_concept_id,INT64,NO,,DC-1368
drug_strength,ingredient_concept_id,INT64,NO,,DC-1368
drug_strength,amount_unit_concept_id,INT64,YES,,DC-1368
drug_strength,numerator_unit_concept_id,INT64,YES,,DC-1368
drug_strength,denominator_unit_concept_id,INT64,YES,,DC-1368
condition_era,condition_concept_id,INT64,NO,,DC-1368
measurement,measurement_concept_id,INT64,NO,,DC-1368
measurement,measurement_type_concept_id,INT64,NO,,DC-1368
measurement,operator_concept_id,INT64,YES,,DC-1368
measurement,value_as_concept_id,INT64,YES,,DC-1368
measurement,unit_concept_id,INT64,YES,,DC-1368
measurement,measurement_source_concept_id,INT64,YES,,DC-1368
visit_cost,currency_concept_id,INT64,YES,,DC-1368
provider,specialty_concept_id,INT64,YES,,DC-1368
provider,gender_concept_id,INT64,YES,,DC-1368
provider,specialty_source_concept_id,INT64,YES,,DC-1368
provider,gender_source_concept_id,INT64,YES,,DC-1368
person,gender_concept_id,INT64,NO,,DC-1368
person,race_concept_id,INT64,NO,,DC-1368
person,ethnicity_concept_id,INT64,NO,,DC-1368
person,gender_source_concept_id,INT64,YES,,DC-1368
person,race_source_concept_id,INT64,YES,,DC-1368
person,ethnicity_source_concept_id,INT64,YES,,DC-1368
drug_cost,currency_concept_id,INT64,YES,,DC-1368
cohort_attribute,value_as_concept_id,INT64,YES,,DC-1368
procedure_occurrence,procedure_concept_id,INT64,NO,,DC-1368
procedure_occurrence,procedure_type_concept_id,INT64,NO,,DC-1368
procedure_occurrence,modifier_concept_id,INT64,YES,,DC-1368
procedure_occurrence,procedure_source_concept_id,INT64,YES,,DC-1368
care_site,place_of_service_concept_id,INT64,YES,,DC-1368
specimen,specimen_concept_id,INT64,NO,,DC-1368
specimen,specimen_type_concept_id,INT64,NO,,DC-1368
specimen,unit_concept_id,INT64,YES,,DC-1368
specimen,anatomic_site_concept_id,INT64,YES,,DC-1368
specimen,disease_status_concept_id,INT64,YES,,DC-1368
death,death_type_concept_id,INT64,NO,,DC-1368
death,cause_concept_id,INT64,YES,,DC-1368
death,cause_source_concept_id,INT64,YES,,DC-1368
device_exposure,device_concept_id,INT64,NO,,DC-1368
device_exposure,device_type_concept_id,INT64,NO,,DC-1368
device_exposure,device_source_concept_id,INT64,YES,,DC-1368
device_cost,currency_concept_id,INT64,YES,,DC-1368
condition_occurrence,condition_concept_id,INT64,NO,,DC-1368
condition_occurrence,condition_type_concept_id,INT64,NO,,DC-1368
condition_occurrence,condition_source_concept_id,INT64,YES,,DC-1368
condition_occurrence,condition_status_concept_id,INT64,YES,,DC-1368
cohort_definition,definition_type_concept_id,INT64,NO,,DC-1368
cohort_definition,subject_concept_id,INT64,NO,,DC-1368
attribute_definition,attribute_type_concept_id,INT64,NO,,DC-1368
drug_exposure,drug_concept_id,INT64,NO,,DC-1368
drug_exposure,drug_type_concept_id,INT64,NO,,DC-1368
drug_exposure,route_concept_id,INT64,YES,,DC-1368
drug_exposure,drug_source_concept_id,INT64,YES,,DC-1368
note,note_type_concept_id,INT64,NO,,DC-1368
note,note_class_concept_id,INT64,NO,,DC-1368
note,encoding_concept_id,INT64,NO,,DC-1368
note,language_concept_id,INT64,NO,,DC-1368
cost,cost_type_concept_id,INT64,NO,,DC-1368
cost,currency_concept_id,INT64,YES,,DC-1368
cost,revenue_code_concept_id,INT64,YES,,DC-1368
dose_era,drug_concept_id,INT64,NO,,DC-1368
dose_era,unit_concept_id,INT64,NO,,DC-1368
observation,observation_concept_id,INT64,NO,,DC-1388
observation,observation_type_concept_id,INT64,NO,,DC-1388
observation,value_as_concept_id,INT64,YES,,DC-1388
Expand Down
Original file line number Diff line number Diff line change
@@ -1,29 +1 @@
table_name,column_name,data_type,primary_key,mapping_table,new_id,rule
observation,person_id,INT64,observation_id,_deid_map,research_id,DC-1346
drug_era,person_id,INT64,drug_era_id,_deid_map,research_id,DC-1346
observation_period,person_id,INT64,observation_period_id,_deid_map,research_id,DC-1346
visit_occurrence,person_id,INT64,visit_occurrence_id,_deid_map,research_id,DC-1346
measurement,person_id,INT64,measurement_id,_deid_map,research_id,DC-1346
person,person_id,INT64,person_id,_deid_map,research_id,DC-1346
procedure_occurrence,person_id,INT64,procedure_occurrence_id,_deid_map,research_id,DC-1346
specimen,person_id,INT64,specimen_id,_deid_map,research_id,DC-1346
device_exposure,person_id,INT64,device_exposure_id,_deid_map,research_id,DC-1346
payer_plan_period,person_id,INT64,payer_plan_period_id,_deid_map,research_id,DC-1346
condition_occurrence,person_id,INT64,condition_occurrence_id,_deid_map,research_id,DC-1346
drug_exposure,person_id,INT64,drug_exposure_id,_deid_map,research_id,DC-1346
note,person_id,INT64,note_id,_deid_map,research_id,DC-1346
dose_era,person_id,INT64,dose_era_id,_deid_map,research_id,DC-1346
survey_conduct,person_id,INT64,survey_conduct_id,_deid_map,research_id,DC-1346
observation,questionnaire_response_id,INT64,observation_id,_deid_questionnaire_response_map,research_response_id,DC-1348
survey_conduct,survey_conduct_id,INT64,survey_conduct_id,_deid_questionnaire_response_map,research_response_id,DC-1348
survey_conduct,survey_source_identifier,STRING,survey_conduct_id,_deid_questionnaire_response_map,research_response_id,DC-1348
observation,value_as_string,STRING,observation_id,_zip_map,,DC-1377
care_site_ext,src_id,STRING,care_site_id,site_maskings,src_id,DC-1355
visit_occurrence_ext,src_id,STRING,visit_occurrence_id,site_maskings,src_id,DC-1355
procedure_occurrence_ext,src_id,STRING,procedure_occurrence_id,site_maskings,src_id,DC-1355
drug_exposure_ext,src_id,STRING,drug_exposure_id,site_maskings,src_id,DC-1355
provider_ext,src_id,STRING,provider_id,site_maskings,src_id,DC-1355
device_exposure_ext,src_id,STRING,device_exposure_id,site_maskings,src_id,DC-1355
condition_occurrence_ext,src_id,STRING,condition_occurrence_id,site_maskings,src_id,DC-1355
observation_ext,src_id,STRING,observation_id,site_maskings,src_id,DC-1355
measurement_ext,src_id,STRING,measurement_id,site_maskings,src_id,DC-1355
Original file line number Diff line number Diff line change
@@ -1,5 +1,2 @@
table_name,rule
note,DC-1362
location,DC-1362
care_site,DC-1362
provider,DC-1362

50 changes: 28 additions & 22 deletions data_steward/cdr_cleaner/clean_cdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
from cdr_cleaner.cleaning_rules.convert_pre_post_coordinated_concepts import ConvertPrePostCoordinatedConcepts
from cdr_cleaner.cleaning_rules.create_aian_lookup import CreateAIANLookup
from cdr_cleaner.cleaning_rules.create_expected_ct_list import StoreExpectedCTList
from cdr_cleaner.cleaning_rules.deid.ct_additional_privacy_suppression import CTAdditionalPrivacyConceptSuppression
from cdr_cleaner.cleaning_rules.deid.ct_observation_privacy_suppression import CTObservationPrivacySuppression
from cdr_cleaner.cleaning_rules.deid.ct_nph_observation_privacy_suppression import CTNPHObservationPrivacySuppression
from cdr_cleaner.cleaning_rules.deid.rt_additional_privacy_suppression import RTAdditionalPrivacyConceptSuppression
from cdr_cleaner.cleaning_rules.deid.rt_observation_privacy_suppression import RTObservationPrivacySuppression
Expand Down Expand Up @@ -354,42 +352,48 @@
(FitbitDateShiftRule,),
]

CONTROLLED_TIER_DEID_CLEANING_CLASSES = [
(RtCtPIDtoRID,),
(QRIDtoRID,), # Should run before any row suppression rules
(TruncateEraTables,),
(NullPersonBirthdate,),
(TableSuppression,),
NPH_CONTROLLED_TIER_DEID_CLEANING_CLASSES = [
(CleanPPINumericFieldsUsingParameters,),
(ControlledTierReplacedConceptSuppression,),
(GeneralizeZipCodes,), # Should run after any data remapping rules
# (RaceEthnicityRecordSuppression,), # Should run after any data remapping rules
(RaceEthnicityRecordSuppression,), # Should run after any data remapping rules
(
MotorVehicleAccidentSuppression,),
(VehicularAccidentConceptSuppression,),
(ExplicitIdentifierSuppression,),
(GeoLocationConceptSuppression,),
(BirthInformationSuppression,),
(YearOfBirthRecordsSuppression,),
(ControlledCopeSurveySuppression,),
(IDFieldSuppression,), # Should run after any data remapping
(CancerConceptSuppression,), # Should run after any data remapping rules
(SectionParticipationConceptSuppression,),
(CTAdditionalPrivacyConceptSuppression,),
(CTObservationPrivacySuppression,),
# (CTNPHObservationPrivacySuppression,), # Applies only to NPH data
(CTNPHObservationPrivacySuppression,), # Applies only to NPH data. will be dealt with in 2.0 when handling row suppressions
(
StringFieldsSuppression,),
(AggregateZipCodes,),
(DeidentifyAIANZip3Values,),
(FreeTextSurveyResponseSuppression,),
(DropOrphanedSurveyConductIds,),
(DropOrphanedPIDS,),
(GenerateWearStudyTable,),
(DropViaSurveyConduct,), # should run after wear study table creation
(RemoveExtraTables,), # Should be last cleaning rule to be run
(CalculatePrimaryDeathRecord,),
(CleanMappingExtTables,), # should be one of the last cleaning rules run
]
(FillSourceValueTextFields,),
]

# Cleaning rule classes registered for the controlled tier de-identification
# stage. Each entry is a one-element tuple wrapping a single rule class.
# NOTE(review): the inline "should run after ..." comments imply that list
# order is execution order -- confirm against the consumer of this list
# before reordering or inserting entries.
CONTROLLED_TIER_DEID_CLEANING_CLASSES = [
(RtCtPIDtoRID,),
(ControlledTierReplacedConceptSuppression,),
(GeneralizeZipCodes,), # Should run after any data remapping rules
(RaceEthnicityRecordSuppression,), # Should run after any data remapping rules
(MotorVehicleAccidentSuppression,),
(VehicularAccidentConceptSuppression,),
(ExplicitIdentifierSuppression,),
(GeoLocationConceptSuppression,),
(BirthInformationSuppression,),
(YearOfBirthRecordsSuppression,),
(IDFieldSuppression,), # Should run after any data remapping
(CancerConceptSuppression,), # Should run after any data remapping rules
(SectionParticipationConceptSuppression,),
(StringFieldsSuppression,),
(AggregateZipCodes,),
(DeidentifyAIANZip3Values,),
(FreeTextSurveyResponseSuppression,)
]

CONTROLLED_TIER_DEID_BASE_CLEANING_CLASSES = [
(FillSourceValueTextFields,),
Expand Down Expand Up @@ -451,6 +455,8 @@
REGISTERED_TIER_FITBIT_CLEANING_CLASSES,
DataStage.CONTROLLED_TIER_DEID.value:
CONTROLLED_TIER_DEID_CLEANING_CLASSES,
DataStage.NPH_CONTROLLED_TIER_DEID.value:
NPH_CONTROLLED_TIER_DEID_CLEANING_CLASSES,
DataStage.CONTROLLED_TIER_DEID_BASE.value:
CONTROLLED_TIER_DEID_BASE_CLEANING_CLASSES,
DataStage.CONTROLLED_TIER_DEID_CLEAN.value:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@
END AS
value_source_concept_id,
value_source_value,
questionnaire_response_id
-- questionnaire_response_id --
-- an NPH-specific change that needs to be addressed before 1.0 executions are run again. --
form_name
FROM
{{project}}.{{dataset}}.observation""")

Expand Down Expand Up @@ -159,7 +161,7 @@ def __init__(self, project_id, dataset_id, sandbox_dataset_id):
'DC1058', 'DC1061', 'DC827', 'DC502', 'DC487', 'DC2475', 'DC2649'
],
description=desc,
affected_datasets=[cdr_consts.RDR],
affected_datasets=[cdr_consts.RDR, cdr_consts.NPH_CONTROLLED_TIER_DEID],
affected_tables=['observation'],
project_id=project_id,
dataset_id=dataset_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
FROM `{{project}}.{{dataset}}.{{domain_table}}` AS d
LEFT JOIN `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` AS s
ON d.{{domain_table}}_id = s.{{domain_table}}_id
WHERE d.{{field_name}} = {{field_value}} and s.{{domain_table}}_id IS NULL
WHERE d.{{field_name}} in ({{field_value}}) and s.{{domain_table}}_id IS NULL
""")

STRING_FIELD_SUPPRESSION_QUERY_TEMPLATE = JINJA_ENV.from_string("""
Expand All @@ -56,7 +56,7 @@
{% endfor %}
FROM `{{project}}.{{sandbox_dataset}}.{{sandbox_table}}` AS s
WHERE d.{{domain_table}}_id = s.{{domain_table}}_id
AND s.{{field_name}} = {{field_value}}
AND s.{{field_name}} in ({{field_value}})
""")

VALIDATION_QUERY_TEMPLATE = JINJA_ENV.from_string("""
Expand Down Expand Up @@ -107,7 +107,8 @@ def get_string_fields(domain_table):
else:
fields = [
field for field in resources.fields_for(domain_table)
if field['type'] == 'string'
if field['type'] == 'string' and
field['name'] not in ['form_name'] # NPH release includes form_name
]

return fields
Expand Down Expand Up @@ -210,7 +211,16 @@ def get_rule_exceptions(self) -> List[SuppressionException]:
sandbox_table=self.sandbox_table_for(OBSERVATION),
field_name=OBSERVATION_SOURCE_CONCEPT_ID,
field_value=APPROXIMATE_DATE_OF_SYMPTOMS,
restore_fields=[VALUE_AS_STRING])
restore_fields=[VALUE_AS_STRING]),
SuppressionException(
domain_table=OBSERVATION,
sandbox_table=self.sandbox_table_for(OBSERVATION),
field_name=OBSERVATION_SOURCE_CONCEPT_ID,
field_value=(f"SELECT concept_id "
f"FROM `{self.project_id}.{self.dataset_id}.concept` "
f"WHERE concept_code in ('cgm_dev_given_id', 'polar_id', 'poloar_actigraph_pair_id', 'device_notcollected', 'scr_studyid', 'profile_zipcode', 'geo_curr_hmzipcode', 'geo_curr_hmcountry')"),
restore_fields=[VALUE_AS_STRING]
)
]

def get_query_specs(self, *args, **keyword_args) -> query_spec_list:
Expand Down
Loading