From 558f68918e8f785bdb394b6fd202d6d33b9a428e Mon Sep 17 00:00:00 2001 From: Aleksandr Medvedev Date: Thu, 23 Dec 2021 17:18:09 +0000 Subject: [PATCH 1/3] Fixed ibd1 ibd2 and shared genome proportions calculation --- rules/relatives_ibis.smk | 2 +- rules/relatives_ibis_king.smk | 1 - scripts/merge_king_ersa.py | 51 ++++++++++----------- scripts/postprocess_ersa.py | 23 +++++++--- workflows/bundle/Snakefile | 84 +++++++++++++++++++++++------------ 5 files changed, 98 insertions(+), 63 deletions(-) diff --git a/rules/relatives_ibis.smk b/rules/relatives_ibis.smk index 6ae56870..cfcb8354 100644 --- a/rules/relatives_ibis.smk +++ b/rules/relatives_ibis.smk @@ -64,7 +64,7 @@ rule ersa: FILES="{input.ibd}" TEMPFILE=ersa/temp_relatives.tsv - + rm $TEMPFILE for input_file in $FILES; do ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE |& tee {log} cat $TEMPFILE >> {output} diff --git a/rules/relatives_ibis_king.smk b/rules/relatives_ibis_king.smk index be026ea5..15119d99 100644 --- a/rules/relatives_ibis_king.smk +++ b/rules/relatives_ibis_king.smk @@ -126,7 +126,6 @@ rule merge_king_ersa: input: king=rules.run_king.output['king'], king_segments=rules.run_king.output['segments'], - bucket_dir=directory('ibd'), ersa=rules.ersa.output[0], kinship=rules.run_king.output['kinship'], kinship0=rules.run_king.output['kinship0'], diff --git a/scripts/merge_king_ersa.py b/scripts/merge_king_ersa.py index 6d4fc9f8..209a9ed9 100644 --- a/scripts/merge_king_ersa.py +++ b/scripts/merge_king_ersa.py @@ -95,8 +95,8 @@ def read_king(king_path): data.loc[:, 'king_degree'] = map_king_degree(data.InfType) data.loc[:, 'king_relation'] = map_king_relation(data.InfType) data.loc[:, 'king_degree'] = data.king_degree.astype(float).astype(pandas.Int32Dtype()) - data.rename({'PropIBD': 'shared_genome_proportion'}, axis='columns', inplace=True) - indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'shared_genome_proportion']].\ + data.rename({'PropIBD': 'shared_genome_proportion', 'IBD1Seg': 'king_ibd1_prop', 'IBD2Seg': 'king_ibd2_prop'}, axis='columns', inplace=True) + indexed = data.loc[:, ['id1', 'id2', 'king_degree', 'king_relation', 'king_ibd1_prop', 'king_ibd2_prop', 'shared_genome_proportion']].\ set_index(['id1', 'id2']) return indexed @@ -106,6 +106,8 @@ def read_king(king_path): 'id2', 'king_degree', 'king_relation', + 'king_ibd1_prop', + 'king_ibd2_prop', 'shared_genome_proportion']).set_index(['id1', 'id2']) @@ -228,8 +230,8 @@ def read_ersa(ersa_path): # Indv_1 Indv_2 Rel_est1 Rel_est2 d_est lower_d upper_d N_seg Tot_cM data = pandas.read_table(ersa_path, header=0, names=['id1', 'id2', 'rel_est1', 'rel_est2', - 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count', 'total_seg_len'], - dtype={'ersa_degree': str}) + 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'seg_count_ersa', 'total_seg_len_ersa'], + dtype={'ersa_degree': str, 'seg_count_ersa': str, 'total_seg_len_ersa': str}) data = data.loc[(data.rel_est1 != 'NA') | (data.rel_est2 != 'NA'), :] data.loc[:, 'id1'] = data.id1.str.strip() @@ -241,18 +243,20 @@ def read_ersa(ersa_path): pandas.Int32Dtype()) data.loc[:, 'ersa_upper_bound'] = pandas.to_numeric(data.ersa_upper_bound.str.strip(), errors='coerce').astype( pandas.Int32Dtype()) + print('dtypes are: ', data.dtypes) + data.loc[:, 'seg_count_ersa'] = pandas.to_numeric(data.seg_count_ersa.str.strip(), errors='coerce').astype( + pandas.Int32Dtype()) + data.loc[:, 'total_seg_len_ersa'] = pandas.to_numeric(data.total_seg_len_ersa.str.strip(), errors='coerce') data.loc[data.ersa_lower_bound != 1, 'ersa_lower_bound'] = data.ersa_lower_bound - 1 data.loc[data.ersa_upper_bound != 1, 'ersa_upper_bound'] = data.ersa_upper_bound - 1 - print(f'Bad degrees are {(data.ersa_lower_bound > data.ersa_degree).sum()}') - print(Counter(data.ersa_degree[data.ersa_lower_bound > data.ersa_degree])) logging.info(f'read {data.shape[0]} pairs from ersa output') logging.info(f'ersa after reading has {pandas.notna(data.ersa_degree).sum()}') data.loc[:, 'is_niece_aunt'] = [True if 'Niece' in d else False for d in data.rel_est1] logging.info(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships') - print(f'we have total of {data.is_niece_aunt.sum()} possible Niece/Aunt relationships') - return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt']].\ + + return data.loc[data.id1 != data.id2, ['id1', 'id2', 'ersa_degree', 'ersa_lower_bound', 'ersa_upper_bound', 'is_niece_aunt', 'seg_count_ersa', 'total_seg_len_ersa']].\ set_index(['id1', 'id2']) @@ -294,8 +298,7 @@ def read_ersa2(ersa_path): test_dir = '/media_ssd/pipeline_data/TF-CEU-TRIBES-ibis-king-2' Snakemake = namedtuple('Snakemake', ['input', 'output', 'params', 'log']) snakemake = Snakemake( - input={'bucket_dir': f'{test_dir}/ibd', - 'king': f'{test_dir}/king/data.seg', + input={'king': f'{test_dir}/king/data.seg', 'king_segments': f'{test_dir}/king/data.segments.gz', 'kinship': f'{test_dir}/king/data.kin', 'kinship0': f'{test_dir}/king/data.kin0', @@ -308,7 +311,6 @@ def read_ersa2(ersa_path): logging.basicConfig(filename=snakemake.log[0], level=logging.DEBUG, format='%(levelname)s:%(asctime)s %(message)s') - ibd_path = snakemake.input['bucket_dir'] king_path = snakemake.input['king'] king_segments_path = snakemake.input['king_segments'] # within families @@ -319,20 +321,14 @@ def read_ersa2(ersa_path): map_dir = snakemake.params['cm_dir'] output_path = snakemake.output[0] - ibd = read_bucket_dir(ibd_path) king = read_king(king_path) kinship = read_kinship(kinship_path, kinship0_path) king_segments = read_king_segments_chunked(king_segments_path, map_dir) ersa = read_ersa(ersa_path) - logging.info(f'ibd shape: {ibd.shape[0]}, ersa shape: {ersa.shape[0]}') - # print('ibd test:', ibd[('GRC12118091', 'GRC12118096')]) - # print('ibd test2:', ibd[('GRC12118096', 'GRC12118091')]) - - relatives = ibd.merge(king, how='outer', left_index=True, right_index=True).\ - merge(kinship, how='outer', left_index=True, right_index=True).\ - merge(ersa, how='outer', left_index=True, right_index=True).\ - merge(king_segments, how='outer', left_index=True, right_index=True) + relatives = king.merge(kinship, how='outer', left_index=True, right_index=True).\ + merge(ersa, how='outer', left_index=True, right_index=True).\ + merge(king_segments, how='outer', left_index=True, right_index=True) prefer_ersa_mask = pandas.isnull(relatives.king_degree) | (relatives.king_degree > 3) relatives.loc[:, 'final_degree'] = relatives.king_degree @@ -340,17 +336,18 @@ def read_ersa2(ersa_path): relatives.loc[prefer_ersa_mask, 'final_degree'] = relatives.ersa_degree logging.info(f'king is null or more than 3: {prefer_ersa_mask.sum()}') logging.info(f'ersa is not null: {pandas.notna(relatives.ersa_degree).sum()}') - + print(f'relatives columns are {relatives.columns}') if 'total_seg_len_king' in relatives.columns: - relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king + relatives.loc[:, 'total_seg_len'] = relatives.total_seg_len_king*relatives.king_ibd1_prop + relatives.loc[:, 'total_seg_len_ibd2'] = relatives.total_seg_len_king*relatives.king_ibd2_prop relatives.loc[:, 'seg_count'] = relatives.seg_count_king - relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_germline - relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_germline - + relatives.loc[prefer_ersa_mask, 'total_seg_len'] = relatives.total_seg_len_ersa + relatives.loc[prefer_ersa_mask, 'seg_count'] = relatives.seg_count_ersa + print('is na: ', pandas.isna(relatives.loc[prefer_ersa_mask, 'total_seg_len_ersa']).sum()) # approximate calculations, IBD share is really small in this case - relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'] / 3540 - relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_germline', 'seg_count_germline'], + relatives.loc[prefer_ersa_mask, 'shared_genome_proportion'] = 0.5*relatives.loc[prefer_ersa_mask, 'total_seg_len'].values / 3440 + relatives.drop(['total_seg_len_king', 'seg_count_king', 'total_seg_len_ersa', 'seg_count_ersa', 'king_ibd1_prop', 'king_ibd2_prop'], axis='columns', inplace=True) logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}') diff --git a/scripts/postprocess_ersa.py b/scripts/postprocess_ersa.py index 3e677438..67f7d956 100644 --- a/scripts/postprocess_ersa.py +++ b/scripts/postprocess_ersa.py @@ -95,27 +95,38 @@ def read_ersa(ersa_path): relatives = ibd.merge(ersa, how='outer', left_index=True, right_index=True) # It is impossible to have more than 50% of ibd2 segments unless you are monozygotic twins or duplicates. - dup_mask = relatives.total_seg_len_ibd2 > 1800 - fs_mask = (relatives.total_seg_len > 2100) & (relatives.total_seg_len_ibd2 > 450) & (~dup_mask) - po_mask = (relatives.total_seg_len / 3540 > 0.8) & (~fs_mask) & (~dup_mask) + GENOME_CM_LEN = 3440 + DUPLICATES_THRESHOLD = 1750 + FS_TOTAL_THRESHOLD = 2100 + FS_IBD2_THRESHOLD = 450 + dup_mask = relatives.total_seg_len_ibd2 > DUPLICATES_THRESHOLD + fs_mask = (relatives.total_seg_len > FS_TOTAL_THRESHOLD) & (relatives.total_seg_len_ibd2 > FS_IBD2_THRESHOLD) & (~dup_mask) + po_mask = (relatives.total_seg_len / GENOME_CM_LEN > 0.8) & (~fs_mask) & (~dup_mask) print(f'We have {sum(dup_mask)} duplicates, {sum(fs_mask)} full siblings and {sum(po_mask)} parent-offspring relationships') relatives.loc[:, 'final_degree'] = relatives.ersa_degree relatives.loc[dup_mask, 'final_degree'] = 0 relatives.loc[po_mask, 'final_degree'] = 1 - relatives.loc[(~po_mask) & (relatives.final_degree == 1)] = 2 # to eliminate some ersa false positives + relatives.loc[(~po_mask) & (relatives.final_degree == 1), 'final_degree'] = 2 # to eliminate some ersa false positives relatives.loc[fs_mask, 'final_degree'] = 2 relatives.loc[:, 'relation'] = relatives.ersa_degree.astype(str) relatives.loc[po_mask, 'relation'] = 'PO' relatives.loc[fs_mask, 'relation'] = 'FS' relatives.loc[dup_mask, 'relation'] = 'MZ/Dup' - # if king is unsure or king degree > 3 then we use ersa distant relatives estimation # approximate calculations, IBD share is really small in this case relatives.loc[pandas.isna(relatives.total_seg_len_ibd2), 'total_seg_len_ibd2'] = 0 relatives.loc[pandas.isna(relatives.seg_count_ibd2), 'seg_count_ibd2'] = 0 - relatives.loc[:, 'shared_genome_proportion'] = 0.5*relatives.total_seg_len / 3540 + relatives.total_seg_len_ibd2 / 3540 + ''' + IBIS and ERSA do not distinguish IBD1 and IBD2 segments + shared_genome_proportion is a proportion of identical alleles + i.e. shared_genome_proportion of [(0/0), (0/1)] and [(0/0), (1/1)] should be 3/4 + GENOME_SEG_LEN is length of centimorgans of the haplotype + ibd2 segments span two haplotypes, therefore their length should be doubled + total_seg_len includes both ibd1 and ibd2 segments length + ''' + relatives.loc[:, 'shared_genome_proportion'] = (relatives.total_seg_len - relatives.total_seg_len_ibd2) / (2*GENOME_CM_LEN) + relatives.total_seg_len_ibd2*2 / (2*GENOME_CM_LEN) relatives.loc[relatives.shared_genome_proportion > 1.0, 'shared_genome_proportion'] = 1.0 logging.info(f'final degree not null: {pandas.notna(relatives.final_degree).sum()}') relatives.loc[pandas.notna(relatives.final_degree), :].to_csv(output_path, sep='\t') diff --git a/workflows/bundle/Snakefile b/workflows/bundle/Snakefile index d3d4b3ea..31141354 100644 --- a/workflows/bundle/Snakefile +++ b/workflows/bundle/Snakefile @@ -24,31 +24,59 @@ BUNDLE_md5 = config["reference"]["bundle" if full else "bundle_min"]["md CHROMOSOMES = [str(i) for i in range(1, 23)] -rule all: - output: - GRCh37_fasta = GRCH37_FASTA, - GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA), - GENETIC_MAP = GENETIC_MAP, - genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES), - cmmap = expand(CMMAP, chrom=CHROMOSOMES), - lift_chain = LIFT_CHAIN, - SITE_1000GENOME = SITE_1000GENOME, - pedsim_map = PEDSIM_MAP, - affymetrix_chip = AFFYMETRIX_CHIP, - vcfRef = expand(REF_VCF, chrom=CHROMOSOMES if full else []), - refHaps = expand(REF_HAPS, chrom=CHROMOSOMES if full else []) - conda: - "../../envs/download.yaml" - log: - "logs/ref/download_bundle.log" - shell: - """ - wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log} - sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1) - if [ $sum != {BUNDLE_md5} ]; then - echo "md5 sum of BUDNLE is invalid" |& tee -a {log} - exit 1 - fi - tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log} - rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log} - """ \ No newline at end of file +if full: + rule all: + output: + GRCh37_fasta = GRCH37_FASTA, + GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA), + GENETIC_MAP = GENETIC_MAP, + genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES), + cmmap = expand(CMMAP, chrom=CHROMOSOMES), + lift_chain = LIFT_CHAIN, + SITE_1000GENOME = SITE_1000GENOME, + pedsim_map = PEDSIM_MAP, + affymetrix_chip = AFFYMETRIX_CHIP, + vcfRef = expand(REF_VCF, chrom=CHROMOSOMES), + refHaps = expand(REF_HAPS, chrom=CHROMOSOMES) + conda: + "../../envs/download.yaml" + log: + "logs/ref/download_bundle.log" + shell: + """ + wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log} + sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1) + if [ $sum != {BUNDLE_md5} ]; then + echo "md5 sum of BUDNLE is invalid" |& tee -a {log} + exit 1 + fi + tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log} + rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log} + """ +else: + rule all: + output: + GRCh37_fasta = GRCH37_FASTA, + GRCh37_fasta_dict = expand("{ref}.dict",ref=GRCH37_FASTA), + GENETIC_MAP = GENETIC_MAP, + genetic_map_GRCh37 = expand(GENETIC_MAP_GRCH37, chrom=CHROMOSOMES), + cmmap = expand(CMMAP, chrom=CHROMOSOMES), + lift_chain = LIFT_CHAIN, + SITE_1000GENOME = SITE_1000GENOME, + pedsim_map = PEDSIM_MAP, + affymetrix_chip = AFFYMETRIX_CHIP + conda: + "../../envs/download.yaml" + log: + "logs/ref/download_bundle.log" + shell: + """ + wget "{BUNDLE_url}{AZURE_KEY}" -O {REF_DIR}/{BUNDLE_basename} --tries 50 |& tee -a {log} + sum=$(md5sum {REF_DIR}/{BUNDLE_basename} | cut -d " " -f1) + if [ $sum != {BUNDLE_md5} ]; then + echo "md5 sum of BUDNLE is invalid" |& tee -a {log} + exit 1 + fi + tar -xzvf {REF_DIR}/{BUNDLE_basename} -C {REF_DIR} |& tee -a {log} + rm -f {REF_DIR}/{BUNDLE_basename} |& tee -a {log} + """ \ No newline at end of file From 2bd8e831a7842a41d0e80fc47ffe0aaea21a3888 Mon Sep 17 00:00:00 2001 From: Aleksandr Medvedev Date: Fri, 24 Dec 2021 12:51:47 +0000 Subject: [PATCH 2/3] Removed unnecessary ersa results header --- rules/relatives_ibis.smk | 12 ++++++++++-- rules/relatives_ibis_king.smk | 12 ++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/rules/relatives_ibis.smk b/rules/relatives_ibis.smk index cfcb8354..ef18c100 100644 --- a/rules/relatives_ibis.smk +++ b/rules/relatives_ibis.smk @@ -64,10 +64,18 @@ rule ersa: FILES="{input.ibd}" TEMPFILE=ersa/temp_relatives.tsv - rm $TEMPFILE + rm -f $TEMPFILE + rm -f {output} + for input_file in $FILES; do + ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE |& tee {log} - cat $TEMPFILE >> {output} + + if [[ "$input_file" == "${{FILES[0]}}" ]]; then + cat $TEMPFILE >> {output} + else + sed 1d $TEMPFILE >> {output} + fi done """ diff --git a/rules/relatives_ibis_king.smk b/rules/relatives_ibis_king.smk index 15119d99..e37ffe5a 100644 --- a/rules/relatives_ibis_king.smk +++ b/rules/relatives_ibis_king.smk @@ -104,10 +104,18 @@ rule ersa: FILES="{input.ibd}" TEMPFILE=ersa/temp_relatives.tsv - + rm -f $TEMPFILE + rm -f {output} + for input_file in $FILES; do + ersa --avuncular-adj -ci --alpha {params.alpha} --dmax 14 -t $ERSA_T -l $ERSA_L -th $ERSA_TH $input_file -o $TEMPFILE |& tee {log} - cat $TEMPFILE >> {output} + + if [[ "$input_file" == "${{FILES[0]}}" ]]; then + cat $TEMPFILE >> {output} + else + sed 1d $TEMPFILE >> {output} + fi done """ From 9d147372c4c5d7e8d0d9222f971bdcfec04cd37e Mon Sep 17 00:00:00 2001 From: Aleksandr Medvedev Date: Fri, 24 Dec 2021 13:49:48 +0000 Subject: [PATCH 3/3] small readme fix --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 0fa7700c..e537e14d 100644 --- a/readme.md +++ b/readme.md @@ -175,7 +175,7 @@ g1-b1-i1 g3-b1-i1 2 2 0.2369 0.1163 * `shared_ancestors` is the most likeliest number of shared ancestors, if it is 0, then one relative is a direct descendant of the other, if 1 then they probably have one common ancestor, i.e. half siblings, if 2 then they have common mother and father, for example. * `final_degree` is simply `king_degree` for close relatives up to 3rd degree and `ersa_degree` for distant relatives. - * `total_seg_len` is the total length of all IBD segments, for the first 3 degrees it is calculated using KING IBD data, + * `total_seg_len` is the total length of all IBD1 segments, for the first 3 degrees it is calculated using KING IBD data, for the 4th+ degrees it is calculated using IBID or Germline IBD data. * `seg_count` is the total number of all IBD segments found by KING for the first 3 degrees and found by IBIS\Germline for the 4th+ degrees.