Skip to content

Commit

Permalink
fix: use canonical transcript as fallback for quantseq data
Browse files Browse the repository at this point in the history
  • Loading branch information
Addimator committed Jan 15, 2024
1 parent 00247ce commit e54ecdc
Show file tree
Hide file tree
Showing 11 changed files with 75 additions and 61 deletions.
2 changes: 1 addition & 1 deletion workflow/resources/datavzrd/diffexp-template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ views:
chromosome_name:
optional: true
display-mode: hidden
transcript_mane_select:
main_transcript_per_gene:
optional: true
display-mode: hidden
ensembl_transcript_id_version:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ enrichment_env = render_enrichment_env()

def kallisto_quant_input(wildcards):
if is_3prime_experiment:
return "results/mane_3prime_reads/{sample}-{unit}.fastq"
return "results/main_transcript_3prime_reads/{sample}-{unit}.fastq"
elif not is_single_end(wildcards.sample, wildcards.unit):
return expand(
"results/trimmed/{{sample}}-{{unit}}.{group}.fastq.gz", group=[1, 2]
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/qc_3prime.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ rule get_aligned_pos:
rule get_selected_transcripts_aligned_read_bins:
input:
aligned_file="results/QC/{sample}-{unit}.aligned.txt",
transcripts_annotation="resources/transcripts_annotation.mane_strand_length.tsv",
transcripts_annotation="resources/transcripts_annotation.main_transcript_strand_length.tsv",
read_length="results/stats/max-read-length.json",
output:
fwrd_allsamp_hist_fil=temp(
Expand Down
20 changes: 10 additions & 10 deletions workflow/rules/quant_3prime.smk
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,29 @@ rule bwa_mem:
"v1.17.2/bio/bwa/mem"


rule get_only_mane_select_reads_closest_to_3_prime:
rule get_only_main_transcript_reads_closest_to_3_prime:
input:
bam="results/mapped_mem/{sample}-{unit}.namesorted.bam",
annotation="resources/transcripts_annotation.mane_strand_length.tsv",
annotation="resources/transcripts_annotation.main_transcript_strand_length.tsv",
output:
mane_select_reads_closest_to_3_prime=temp(
"results/mapped_3prime_mane/{sample}-{unit}.mane_select_closest_to_3_prime.bam"
main_transcript_reads_closest_to_3_prime=temp(
"results/mapped_3prime_main_transcript/{sample}-{unit}.main_transcript_closest_to_3_prime.bam"
),
log:
"logs/mapped_3prime_bam/{sample}-{unit}.mapped.pos.log",
conda:
"../envs/pysam.yaml"
script:
"../scripts/get-only-mane-select-reads-closest-to-3-prime.py"
"../scripts/get-only-main-transcript-reads-closest-to-3-prime.py"


rule get_mane_fastq:
rule get_main_transcript_fastq:
input:
bam="results/mapped_3prime_mane/{sample}-{unit}.mane_select_closest_to_3_prime.bam",
bam="results/mapped_3prime_main_transcript/{sample}-{unit}.main_transcript_closest_to_3_prime.bam",
output:
fastq="results/mane_3prime_reads/{sample}-{unit}.fastq",
fastq="results/main_transcript_3prime_reads/{sample}-{unit}.fastq",
log:
"logs/mane_3prime_reads/{sample}-{unit}.log",
"logs/main_transcript_3prime_reads/{sample}-{unit}.log",
conda:
"../envs/samtools.yaml"
shell:
Expand All @@ -60,7 +60,7 @@ rule get_mane_fastq:

rule kallisto_3prime_index:
input:
fasta="resources/transcriptome.cdna.without_poly_a.mane.fasta",
fasta="resources/transcriptome.cdna.without_poly_a.main_transcript.fasta",
output:
index="results/kallisto_3prime/transcripts.3prime.idx",
log:
Expand Down
2 changes: 1 addition & 1 deletion workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ rule get_transcript_info:
multiext(
"resources/transcripts_annotation",
".results.rds",
".mane_strand_length.tsv",
".main_transcript_strand_length.tsv",
),
params:
species=get_bioc_species_name(),
Expand Down
8 changes: 4 additions & 4 deletions workflow/rules/ref_3prime.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ rule cds_polyA_T_removal:
"../scripts/remove_poly_tails.py"


rule get_mane_transcripts_fasta:
rule get_main_transcripts_fasta:
input:
fasta="resources/transcriptome.cdna.without_poly_a.fasta",
mane_select_transcripts="resources/transcripts_annotation.mane_strand_length.tsv",
main_transcripts_per_gene="resources/transcripts_annotation.main_transcript_strand_length.tsv",
output:
"resources/transcriptome.cdna.without_poly_a.mane.fasta",
"resources/transcriptome.cdna.without_poly_a.main_transcript.fasta",
log:
"logs/get_canonical_transcripts/get_canonical_transcripts.log",
conda:
"../envs/biopython.yaml"
script:
"../scripts/get_mane_select_transcripts_fasta.py"
"../scripts/get_main_transcripts_per_gene_fasta.py"
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ def print_read_closest_to_3_prime(read_set: set[pysam.AlignedSegment]) -> None:
if read.is_mapped:
transcript = transcript_annotations.loc[
transcript_annotations["transcript"] == read.reference_name,
[
"transcript_length",
"transcript_mane_select"
]
["transcript_length", "main_transcript_per_gene"],
].reset_index()
if len(transcript) == 0:
sys.stderr.write(f"Warning: Transcript '{read.reference_name}' not found in downloaded annotations. Skipping read '{read.query_name}' that maps to it.")
sys.stderr.write(
f"Warning: Transcript '{read.reference_name}' not found in downloaded annotations. Skipping read '{read.query_name}' that maps to it."
)
continue
# use read start distance, as reference skips increase that distance
# and also indicate a suboptimal alignment
Expand All @@ -32,11 +31,17 @@ def print_read_closest_to_3_prime(read_set: set[pysam.AlignedSegment]) -> None:
min_dist = distance
min_dist_read = read
min_dist_transcript = transcript
if min_dist != -1 and min_dist_transcript.at[0, "transcript_mane_select"] == 1:
if min_dist != -1 and min_dist_transcript.at[0, "main_transcript_per_gene"] == 1:
records_out.write(min_dist_read)


with pysam.AlignmentFile(snakemake.input["bam"], "rb") as records_in, pysam.AlignmentFile(snakemake.output["mane_select_reads_closest_to_3_prime"], "wb", template=records_in) as records_out:
with pysam.AlignmentFile(
snakemake.input["bam"], "rb"
) as records_in, pysam.AlignmentFile(
snakemake.output["main_transcript_reads_closest_to_3_prime"],
"wb",
template=records_in,
) as records_out:
record_iterator = records_in.fetch(until_eof=True)
current_read = next(record_iterator)
current_queryname_set = set([current_read])
Expand All @@ -47,4 +52,4 @@ def print_read_closest_to_3_prime(read_set: set[pysam.AlignedSegment]) -> None:
else:
current_queryname_set.add(next_read)
current_read = next_read
print_read_closest_to_3_prime(current_queryname_set)
print_read_closest_to_3_prime(current_queryname_set)
6 changes: 4 additions & 2 deletions workflow/scripts/get-sample-hist-bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
trans_length_data = pd.read_csv(
snakemake.input["transcripts_annotation"],
sep="\t",
).drop(columns = ["transcript_mane_select"])
).drop(columns=["main_transcript_per_gene"])

# Aligned text file reading
align_bam_txt = pd.read_csv(
Expand Down Expand Up @@ -134,7 +134,9 @@
elif transcript_ids == "all":
# Values added to corresponding bins
Freq_rev, bins_rev = np.histogram(
read_min["start"], bins=read_length, range=[0, max(read_min["transcript_length"])]
read_min["start"],
bins=read_length,
range=[0, max(read_min["transcript_length"])],
)
Freq_rev_trim, bins_rev_trim = np.histogram(
read_min["start"], bins=read_length, range=[0, 20000]
Expand Down
28 changes: 20 additions & 8 deletions workflow/scripts/get-transcript-info.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,16 @@ three_prime_attributes <- c(
"strand"
)

if (three_prime_activated & !("transcript_mane_select" %in% available_attributes)) {
if (three_prime_activated &
!("transcript_mane_select" %in% available_attributes) &
!("transcript_is_canonical" %in% available_attributes)
) {
cli_abort(
str_c(
"Three prime mode for Lexogen QuantSeq analysis is activated, which ",
"needs the transcript_mane_select attribute from biomart. However, ",
"this attribute is not available for the species '",
"needs the transcript_mane_select or the transcript_is_canonical ",
"attribute from biomart. However, these attributes ",
"are not available for the species '",
snakemake@params[["species"]],
"' in the ensembl release version: ",
snakemake@params[["version"]]
Expand Down Expand Up @@ -168,9 +172,6 @@ other_annotations <- all_annotations |>
# for non-3-prime kallisto-sleuth input
filter(!str_detect(chromosome_name, "patch|PATCH")) |>
select(-c(chromosome_name, sleuth_attributes)) |>
select(
-any_of("transcript_is_canonical")
) |>
rename(transcript = ensembl_transcript_id_version) |>
mutate(
strand = case_match(
Expand All @@ -182,15 +183,26 @@ other_annotations <- all_annotations |>

if ("transcript_mane_select" %in% colnames(other_annotations)) {
other_annotations <- other_annotations |>
select(
-any_of("transcript_is_canonical")
) |>
mutate(
# we don't need the NCBI IDs that match the MANE transcripts, only an
# indicator whether a transcript is in MANE select
transcript_mane_select = if_else(transcript_mane_select != "", 1, 0, NA)
main_transcript_per_gene = if_else(transcript_mane_select != "", 1, 0, NA)
) |>
select(
-any_of("transcript_mane_select")
)
} else {
other_annotations <- other_annotations |>
mutate(
# ensure consistent column presence in the output file
add_column(transcript_mane_select = NA)
main_transcript_per_gene = if_else(transcript_is_canonical, 1, 0, NA)
) |>
select(
-any_of("transcript_is_canonical")
)
}

write_tsv(
Expand Down
20 changes: 20 additions & 0 deletions workflow/scripts/get_main_transcripts_per_gene_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import sys
import pandas as pd
from Bio import SeqIO

sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as transcript_clean_cdna_fasta:
all_transcripts = pd.read_csv(
snakemake.input["main_transcripts_per_gene"],
sep="\t",
)
main_transcripts_per_gene = set(
all_transcripts.loc[
all_transcripts["main_transcript_per_gene"] == 1, "transcript"
]
)

for seq_record in SeqIO.parse(snakemake.input["fasta"], "fasta"):
if seq_record.id in main_transcripts_per_gene:
SeqIO.write(seq_record, transcript_clean_cdna_fasta, "fasta")
25 changes: 0 additions & 25 deletions workflow/scripts/get_mane_select_transcripts_fasta.py

This file was deleted.

0 comments on commit e54ecdc

Please sign in to comment.