forked from UMN-EGGL/RNAMapping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stringtie.snake
109 lines (90 loc) · 3.96 KB
/
Stringtie.snake
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
from snakemake.remote.S3 import RemoteProvider as S3RemoteProvider
s3_key_id = os.environ.get('AWS_ACCESS_KEY')
s3_access_key = os.environ.get('AWS_SECRET_KEY')
S3 = S3RemoteProvider(
endpoint_url='https://s3.msi.umn.edu',
access_key_id=s3_key_id,
secret_access_key=s3_access_key
)
SAMPLES, = S3.glob_wildcards('HorseGeneAnnotation/private/sequence/RNASEQ/fastq/{sample}_R2_001.fastq.gz')
configfile: "config.yaml"
rule all:
input:
#expand("HorseGeneAnnotation/public/refgen/{GCF}/GFF/Merged/{sample}/{sample}.gff" ,sample=SAMPLES,GCF=config['GCF'])
#expand('HorseGeneAnnotation/public/refgen/{GCF}/GFF/Merged.gff',GCF=config['GCF'])
f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/transcript_fpkm.tsv",
f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/gene_fpkm.tsv"
rule sort_bam:
input:
bam = S3.remote('HorseGeneAnnotation/private/sequence/RNASEQ/bam/{sample}_Aligned.out.bam')
output:
sortedbam = 'HorseGeneAnnotation/private/sequence/RNASEQ/bam/{sample}.sorted.bam'
shell:
'''
samtools view -u {input.bam} | samtools sort - -o {output.sortedbam}
'''
rule run_stringtie:
input:
bam = 'HorseGeneAnnotation/private/sequence/RNASEQ/bam/{sample}.sorted.bam',
gff = S3.remote(expand("HorseGeneAnnotation/public/refgen/{GCF}/{GCF}_genomic.nice.gff.gz", GCF=config['GCF']),keep_local=True)
output:
gff = S3.remote(f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/{{sample}}.gff")
run:
shell('''
stringtie \
{input.bam} \
-G {input.gff} \
-o {output.gff}
''')
rule stringtie_merge:
input:
gffs = S3.remote(expand(f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/{{sample}}.gff",sample=SAMPLES),keep_local=True),
refgff = S3.remote(expand("HorseGeneAnnotation/public/refgen/{GCF}/{GCF}_genomic.nice.gff.gz", GCF=config['GCF']),keep_local=True)
output:
merged_gff=expand('HorseGeneAnnotation/public/refgen/{GCF}/GFF/Merged.gff',GCF=config['GCF'])
run:
# Write each gff, one per line
with open('all_GFFs_list.txt','w') as OUT:
print('\n'.join(input.gffs),file=OUT)
shell('''
stringtie \
--merge -p 10 -o {output.merged_gff} \
-G {input.refgff} \
all_GFFs_list.txt
''')
rule stringtie_recalculate_on_merged:
input:
bam = 'HorseGeneAnnotation/private/sequence/RNASEQ/bam/{sample}.sorted.bam',
merged_gff=expand('HorseGeneAnnotation/public/refgen/{GCF}/GFF/Merged.gff',GCF=config['GCF'])
output:
countsdir = directory(f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/{{sample}}/counts"),
gff = f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/{{sample}}/{{sample}}.gff"
run:
shell('''
stringtie \
-e \
-b {output.countsdir}\
-G {input.merged_gff} \
-o {output.gff} \
{input.bam}
''')
rule make_FPKM_tables:
input:
counts = expand('HorseGeneAnnotation/public/refgen/{GCF}/GFF/Merged/{sample}/counts/t_data.ctab',GCF=config['GCF'],sample=SAMPLES)
output:
transcript_fpkm = f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/transcript_fpkm.tsv",
gene_fpkm = f"HorseGeneAnnotation/public/refgen/{config['GCF']}/GFF/Merged/gene_fpkm.tsv"
run:
import pandas as pd
import numpy as np
dfs = []
for sample,f in zip(SAMPLES,input):
df = pd.read_table(f)
df['sample'] = sample
dfs.append(df)
df = pd.concat(dfs)
by_transcript = pd.pivot_table(df,index='t_name',columns='sample',values='FPKM')
by_transcript.to_csv(output.transcript_fpkm,sep='\t')
by_gene = pd.pivot_table(df,index='gene_name',columns='sample',values='FPKM',aggfunc=np.mean)
by_gene.to_csv(output.gene_fpkm,sep='\t')