-
Notifications
You must be signed in to change notification settings - Fork 41
/
wgd_cli.py
executable file
·1482 lines (1275 loc) · 53.6 KB
/
wgd_cli.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python3.5
"""
--------------------------------------------------------------------------------
Copyright (C) 2018 Arthur Zwaenepoel
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact: [email protected]
--------------------------------------------------------------------------------
The **command line interface** (CLI) is the main way to interact with the `wgd`
package. The CLI is organized with a Click command that wraps a function with
the same name followed by an underscore (this is chosen mostly so that the
pipeline commands can reuse code from other subcommands).
Upon successful installation you should be able to run::
$ wgd -h
Usage: wgd [OPTIONS] COMMAND [ARGS]...
Welcome to the wgd command line interface!
_______
\\ ___ `'.
_ _ .--./) ' |--.\\ \\
/\\ \\ ///.''\\ | | \\ '
`\\ //\\\\ //| | | | | | | '
\\`// \\'/ \`-' / | | | |
\\| |/ /("'` | | ' .'
' \\ '---. | |___.' /'
/'""'.\ /_______.'/
|| ||\\_______|/
\\'. __//
`'---'
wgd Copyright (C) 2018 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
under certain conditions;
Contact: [email protected]
Options:
-v, --verbosity [info|debug] Verbosity level, default = info.
-l, --logfile TEXT File to write logs to (optional)
-h, --help Show this message and exit.
Commands:
kde Fit a KDE to a Ks distribution.
ksd Ks distribution construction.
mcl All-vs.-all blastp + MCL analysis.
mix Mixture modeling of Ks distributions.
syn Co-linearity analyses.
viz Plot histograms/densities (interactively).
wf1 Standard workflow whole paranome Ks.
wf2 Standard workflow one-vs-one ortholog Ks.
Note that the verbose flag can be set before the subcommands::
$ wgd --verbose [silent|info|debug] [COMMAND]
Which sets the verbosity for the logging.
All commands are equipped with usage instructions and documentation of options
and arguments, which can be viewed by using the ``--help`` or ``-h`` flag. These
should be quite self-explanatory, but for further documentation you can refer to
the documentation of the specific functions that are called. These can be found
on this page (e.g. the function called by ``wgd blast`` is
:py:func:`wgd_cli.blast_`).
For more information on the methods used in ``wgd`` to compute |Ks| distributions,
please refer to :ref:`methods`.
Example
=======
Here is a small example on how to use the package through the CLI. This is a
workflow for constructing a |Ks| distribution for a fasta file with CDS
sequences called ``ath.cds.fasta``. File names may be different, but the point
will be clear.
(1) Get the paranome, i.e. perform all-against-all Blastp and MCL clustering,
notice how we specify to use 8 threads::
$ wgd mcl --cds --mcl -s ath.cds.fasta -o ./ -n 8
(2) Construct a |Ks| distribution, use FastTree for inferring the phylogenetic
trees used in the node weighting procedure::
$ wgd ksd -o ./ -n 8 ./ath.mcl ath.cds.fasta
(3) Run I-ADHoRe and get an anchor-point |Ks| distribution, as well as dotplots.
here we need a structural annotation in GFF format (see e.g.
https://bioinformatics.psb.ugent.be/plaza/versions/plaza_v4_dicots/download/ for
examples of such files)::
$ wgd syn ath.gff ath.mcl -ks ath.mcl.ks.tsv -f gene -a ID
(4) Fit some gaussian mixture models with 1 to 5 components (default
parameters)::
$ wgd mix ath.mcl.ks.tsv -n 1 5
for more information on mixture modeling and some cautionary notes refer to
:ref:`note_on_gmms`
(5) Explore the full and anchor distribution with kernel density estimates
interactively. First run a bokeh server in the background::
$ bokeh serve &
next, execute the following command::
$ wgd viz -i -ks ath.mcl.ks.tsv,ath.mcl.ks_anchors.tsv -l full,anchors
a tab in your default browser should appear. See :ref:`viz_info` for more
information on vizualization with ``wgd viz``
--------------------------------------------------------------------------------
Reference
=========
"""
# TODO's & IDEAS:
# - Use biopython for all sequence handling
# - Use tmp files to keep track of the outputs from subprocesses, because using
# PIPE might result in hangs.
# - Convert input CDS to uppercase (with warning), since wgd blast breaks with
# lowercase nucleotides
# keep these imports to a minimum to speed up initial CLI loading
import click
import logging
import sys
import os
import warnings
import pandas as pd
import coloredlogs
import subprocess
from wgd.utils import translate_cds, read_fasta, write_fasta, can_i_run_software
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# CLI ENTRY POINT --------------------------------------------------------------
@click.group(context_settings={'help_option_names': ['-h', '--help']})
@click.version_option("v1.1", prog_name="wgd", help="Print version number.")
@click.option(
'--verbosity', '-v', type=click.Choice(['info', 'debug']),
default='info', help="Verbosity level, default = info."
)
@click.option(
'--logfile', '-l', default=None,
help="File to write logs to (optional)."
)
def cli(verbosity, logfile):
"""
Welcome to the wgd command line interface!
\b
_______
\\ ___ `'.
_ _ .--./) ' |--.\\ \\
/\\ \\\\ ///.''\\\\ | | \\ '
`\\\\ //\\\\ //| | | | | | | '
\\`// \\'/ \\`-' / | | | |
\\| |/ /("'` | | ' .'
' \\ '---. | |___.' /'
/'""'.\\ /_______.'/
|| ||\\_______|/
\\'. __//
`'---'
\b
wgd Copyright (C) 2018 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
This is free software, and you are welcome to redistribute it
under certain conditions;
Contact: [email protected]
"""
if not logfile:
coloredlogs.install(
fmt='%(asctime)s: %(levelname)s\t%(message)s',
level=verbosity.upper(), stream=sys.stdout
)
else:
print('Logs will be written to {}'.format(logfile))
logging.basicConfig(
filename=logfile,
filemode='a',
format='%(asctime)s: %(levelname)s\t%(message)s',
datefmt='%H:%M:%S',
level=verbosity.upper()
)
# get round problem with python multiprocessing library that can set all cpu
# affinities to a single cpu found in OrthoFinder source code
if sys.platform.startswith("linux"):
with open(os.devnull, "w") as f:
subprocess.call("taskset -p 0xffffffffffff %d" % os.getpid(),
shell=True, stdout=f)
# Check and optionally refactor data -------------------------------------------
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.argument('sequences', nargs=-1, type=click.Path(exists=True))
@click.option('--rename', is_flag=True, help='rename sequences')
@click.option('--prefix', default=None, help='prefix strings')
@click.option('--out', default=None, help='output file name prefixes')
def pre(sequences, rename, prefix, out):
"""
Check and optionally rename CDS files.
Example usage (renaming)
wgd pre ath.cds.fasta vvi.cds.fasta --rename --prefix ath,vvi
Example usage (just checking files)
wgd pre ath.cds.fasta
"""
from wgd.pre import check_cds
if prefix:
prefixes = prefix.split(",")
else:
prefixes = sequences
if out:
out = out.split(",")
else:
out = [s + ".pre" for s in sequences]
for i, seq in enumerate(sequences):
logging.info("({}) Checking {}".format(i, seq))
check_cds(seq, out[i] + ".good", out[i] + ".bad",
rename=rename, prefix=prefixes[i])
# BLAST AND MCL ----------------------------------------------------------------
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.option(
'--cds', is_flag=True,
help='sequences are CDS'
)
@click.option(
'--mcl', is_flag=True,
help='perform MCL clustering'
)
@click.option(
'--one_v_one', is_flag=True,
help='get one vs. one orthologs'
)
@click.option(
'--sequences', '-s', default=None,
help='input fasta files, as a comma separated string (e.g. '
'x.fasta,y.fasta,z.fasta) or directory name.'
)
@click.option(
'--species_ids', '-id', default=None,
help='species identifiers for respective input sequence files, as comma'
' separated string (e.g. x,y,z). (optional)'
)
@click.option(
'--blast_results', '-b', default=None,
help='input precomputed tab separated blast results'
)
@click.option(
'--inflation_factor', '-I', default=2.0, show_default=True,
help="inflation factor for MCL clustering"
)
@click.option(
'--eval_cutoff', '-e', default=1e-10, show_default=True,
help="e-value cut-off for blast results"
)
@click.option(
'--output_dir', '-o', default='wgd_blast', show_default=True,
help='output directory'
)
@click.option(
'--n_threads', '-n', default=4, show_default=True,
help='number of threads used by blastp'
)
def mcl(
cds, mcl, one_v_one, sequences, species_ids, blast_results,
inflation_factor, eval_cutoff, output_dir, n_threads
):
"""
All-vs.-all blastp + MCL clustering.
Requires blastp, makeblastdb (ncbi-blast+ suite) and mcl. Note the two key
parameters, being the e-value cut-off and inflation factor. It is advised to
explore the effects of these on your analysis.
Example 1 - whole paranome delineation:
wgd mcl --cds --mcl -s thorny_devil.fasta -o thorny_devil_blast_out
Example 2 - one vs. one ortholog delineation:
wgd mcl --cds --one_v_one -s equus_ferus.fasta,ursus_arctos.fasta
-id horse,bear -e 1e-8 -o bear_horse_out
wgd Copyright (C) 2018 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
"""
# lazy imports
blast_mcl(cds, mcl, one_v_one, sequences, species_ids, blast_results,
inflation_factor, eval_cutoff, output_dir, n_threads)
def blast_mcl(
cds=True, mcl=True, one_v_one=False, sequences=None,
species_ids=None, blast_results=None, inflation_factor=2.0,
eval_cutoff=1e-10, output_dir='wgd_blast', n_threads=4
):
"""
All vs. all Blast + MCL pipeline. For usage in the ``wgd`` CLI. Can be used
to perform all vs. all Blast, MCL clustering and one vs. one ortholog
delineation.
:param cds: boolean, indicates that the provided sequences are CDS
:param mcl: boolean, perform MCL clustering
:param one_v_one: boolean, identify whether one vs. one orthologs are to be
inferred (reciprocal best hits) (True) or a paranome (False).
:param sequences: CDS fasta files, if multiple (for one vs. one ortholog
identification), then as a comma-separated string e.g.
``ath.fasta,aly.fasta``
:param species_ids: comma-separated species ids, optional for one-vs-one
ortholog delineation (will prefix the sequence IDs in that case).
:param blast_results: precomputed blast results (tab separated blast output
style)
:param inflation_factor: inflation factor for MCL clustering
:param eval_cutoff: e-value cut off for blastp analysis
:param output_dir: output directory
:param n_threads: number of threads to use
:return: output file name
"""
# lazy imports
from wgd.blast_mcl import all_v_all_blast, run_mcl_ava, ava_blast_to_abc, \
get_one_v_one_orthologs_rbh
from wgd.utils import uniq_id
# software checks
software = []
if not blast_results:
software += ['makeblastdb', 'blastp']
if mcl:
software.append('mcl')
if can_i_run_software(software) == 1:
logging.error('Could not run all required software, will exit here.')
return
# input checks
if not sequences and not blast_results:
logging.error('No sequences nor blast results provided! Please use the '
'--help flag for usage instructions.')
return
if not os.path.exists(output_dir):
logging.info('Output directory: {} does not exist, will make it.'
''.format(output_dir))
os.mkdir(output_dir)
# all vs. all blast
if not blast_results:
# get sequences from directory
if type(sequences) == tuple:
sequence_files = sequences
elif os.path.isdir(sequences):
sequence_files = sorted([os.path.join(sequences, x) for x
in os.listdir(sequences) if
x.endswith(('fa', 'fasta', 'tfa', 'faa'))])
logging.info('Read the following fasta files from directory {}:'
''.format(sequences))
logging.info('{}'.format(', '.join(sequence_files)))
# sequences as comma-separated string
else:
sequence_files = sequences.strip().split(',')
# input checks
if len(sequence_files) != 1 and not one_v_one:
logging.info('More then one fasta file provided and one_v_one flag '
'not set, will perform all-vs.-all blast.')
if len(sequence_files) != 2 and one_v_one:
logging.error('Please two fasta files for one-vs-one ortholog '
'finding')
return
if species_ids:
ids = species_ids.strip().split(',')
if len(ids) != len(sequence_files):
logging.error('Number of species identifiers ({0}) does not '
'match number of provided sequence files ({1}).'
''.format(len(ids), len(sequence_files)))
return
elif one_v_one:
ids = [os.path.basename(x) for x in sequence_files]
else:
ids = [''] * len(sequence_files)
# get protein sequences
if cds:
logging.info("CDS sequences provided, will first translate.")
protein_sequences = []
for i in range(len(sequence_files)):
if cds:
protein_seqs = translate_cds(read_fasta(
sequence_files[i], prefix=ids[i]))
protein_sequences.append(protein_seqs)
else:
protein_sequences.append(read_fasta(
sequence_files[i], prefix=ids[i]))
# blast
logging.info('Writing blastdb sequences to db.fasta.')
db = os.path.join(output_dir, uniq_id() + '.db.fasta')
# all-vs.-all
d = {}
for x in protein_sequences:
d.update(x)
write_fasta(d, db)
query = os.path.join(output_dir, uniq_id() + '.query.fasta')
logging.info('Writing query sequences to query.fasta.')
write_fasta(d, query)
# start the blast
logging.info('Performing all-vs.-all Blastp (this might take a while)')
blast_results = all_v_all_blast(
query, db, output_dir,
output_file='{}.blast.tsv'.format(
'_'.join([os.path.basename(x) for x in sequence_files])
),
eval_cutoff=eval_cutoff,
n_threads=n_threads
)
logging.info('Blast done')
# remove redundant files
os.system('rm {}'.format(db))
if db != query:
os.system('rm {}'.format(query))
# get one-vs-one orthologs (RBHs)
if one_v_one:
logging.info('Retrieving one vs. one orthologs')
one_v_one_out = get_one_v_one_orthologs_rbh(blast_results, output_dir)
logging.info('Done')
return one_v_one_out
# get paranome (MCL)
if mcl:
logging.info('Performing MCL clustering (inflation factor = {0})'
''.format(inflation_factor))
ava_graph = ava_blast_to_abc(blast_results)
mcl_out = run_mcl_ava(
ava_graph, output_dir=output_dir,
output_file='{}.mcl'.format(os.path.basename(blast_results)),
inflation=inflation_factor
)
logging.info('Done')
return mcl_out
return blast_results
# Diamond + MCL
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.argument('sequences', nargs=-1, type=click.Path(exists=True))
@click.option('--outdir', '-o', default='wgd_dmd', show_default=True,
help='output directory')
@click.option('--tmpdir', '-t', default=None, show_default=True,
help='tmp directory')
@click.option('--inflation', '-I', default=2.0, help="inflation factor for MCL")
@click.option('--eval', '-e', default=1e-10, help="e-value cut-off for similarity")
@click.option('--ignorestop', is_flag=True, help="translate through STOP codons")
@click.option('--nostrictcds', is_flag=True, help="do not enforce proper CDS sequences")
def dmd(sequences, outdir, tmpdir, inflation, eval, ignorestop, nostrictcds):
"""
All-vs.-all diamond blastp + MCL clustering.
Requires diamond and mcl. Note the two key parameters, being the e-value
cut-off and inflation factor. It is advised to explore the effects of these
on your analysis.
Example 1 - whole paranome delineation:
wgd dmd ath.fasta
Example 2 - one vs. one ortholog delineation:
wgd dmd ath.fasta vvi.fasta
Example 3 - one vs. one ortholog delineation for multiple pairs:
wgd dmd ath.fasta vvi.fasta egr.fasta
wgd Copyright (C) 2019 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
"""
from wgd.diamond import SequenceData
s = [SequenceData(s, out_path=outdir, tmp_path=tmpdir,
to_stop=not ignorestop, cds=not nostrictcds) for s in sequences]
if len(s) == 0:
logging.error("No sequences provided!")
return
elif len(s) == 1:
logging.info("One CDS file: will compute paranome")
s[0].get_paranome(inflation=inflation, eval=eval)
s[0].write_paranome()
else:
logging.info("Multiple CDS files: will compute RBH orthologs")
for i in range(len(s)-1):
for j in range(i+1, len(s)):
logging.info("{} vs. {}".format(s[i].prefix, s[j].prefix))
s[i].get_rbh_orthologs(s[j], eval=eval)
s[i].write_rbh_orthologs(s[j])
if tmpdir is None:
[x.remove_tmp(prompt=False) for x in s]
# Ks ANALYSIS USING JOBLIB/ASYNC ----------------------------------------------
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.argument('gene_families', nargs=1, type=click.Path(exists=True))
@click.argument('sequences', nargs=-1, type=click.Path(exists=True))
@click.option(
'--output_directory', '-o', default='wgd_ksd', show_default=True,
help='output directory'
)
@click.option(
'--protein_sequences', '-p', default=None,
help="protein sequences fasta file (optional)"
)
@click.option(
'--tmp_dir', '-tmp', default=None, show_default=True,
help="path to store temporary files"
)
@click.option(
'--aligner', '-a', default='mafft', show_default=True,
type=click.Choice(['muscle', 'prank', 'mafft']),
help='aligner program to use, from fast to slow: muscle, prank'
)
@click.option(
'--times', '-t', default=1, show_default=True,
help="number of times to perform ML estimation"
)
@click.option(
'--min_msa_length', '-mml', default=100, show_default=True,
help="minimum MSA length for pairwise mode"
)
@click.option(
'--n_threads', '-n', default=4, show_default=True,
help="number of threads to use"
)
@click.option(
'--wm', '-w', type=click.Choice(['alc', 'fasttree', 'phyml']),
default='fasttree', show_default=True,
help="node weighting method, from fast to slow: alc, fasttree, phyml"
)
@click.option(
'--pairwise', is_flag=True,
help="perform the analysis using a pairwise approach"
)
@click.option(
'--max_pairwise', '-mp', default=10000, show_default=True,
help="maximum number of pairwise combinations a family may have"
)
@click.option(
'--ignore_prefixes', is_flag=True,
help="ignore gene ID prefixes (defined by the '|' symbol) in the gene "
"families file."
)
@click.option(
'--one_v_one', is_flag=True,
help="one vs one ortholog distribution"
)
@click.option(
'--preserve', is_flag=True,
help="keep multiple sequence alignment, codeml output and trees"
)
@click.option(
'--codeml_options', default="",
help="other options for codeml, as a comma separated string, "
"e.g. getSE=1,CodonFreq=1"
)
def ksd(
gene_families, sequences, output_directory, protein_sequences, tmp_dir,
aligner, times, min_msa_length, n_threads, wm, pairwise,
max_pairwise, ignore_prefixes, one_v_one, preserve, codeml_options
):
"""
Ks distribution construction.
Ks distribution construction for a set of paralogs or one-to-one orthologs.
This implementation uses the joblib library for parallelization.
Requires both ``codeml`` and ``muscle|mafft|prank`` for alignment. Depending
on the weighting method chosen (``wm`` option) ``phyml`` or ``FastTree``
might also be required.
Example 1 - whole paranome Ks distribution:
wgd ksd --n_threads 8 fringilla_coelebs.mcl fringilla_coelebs.cds.fasta
Example 2 - one vs. one ortholog Ks distribution:
wgd ksd -o beaver_eagle orthologs.tsv beaver.cds.fasta eagle.cds.fasta
wgd Copyright (C) 2018 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
"""
codeml_opts = {x.split("=")[0].strip(): x.split("=")[1].strip()
for x in codeml_options.split(",") if x != ""}
logging.debug(codeml_opts)
ksd_(
gene_families, sequences, output_directory, protein_sequences,
tmp_dir, aligner, codeml='codeml',
times=times, min_msa_length=min_msa_length,
ignore_prefixes=ignore_prefixes, one_v_one=one_v_one,
preserve=preserve, n_threads=n_threads,
weighting_method=wm, pairwise=pairwise,
max_pairwise=max_pairwise, **codeml_opts
)
def ksd_(
gene_families, sequences, output_directory, protein_sequences=None,
tmp_dir=None, aligner='muscle', codeml='codeml', times=1,
min_msa_length=100, ignore_prefixes=False, one_v_one=False,
pairwise=False, preserve=False, n_threads=4,
weighting_method='fasttree', max_pairwise=10000, **kwargs
):
"""
Ks distribution construction pipeline. For usage in the ``wgd`` CLI.
:param gene_families: gene families, i.e. tab separated paralogs or
one-vs-one orthologs (see :py:func:`blast_`)
:param sequences: CDS fasta files, if multiple (for constructing one-vs.-one
ortholog distribution) then as a comma separated string
:param output_directory: output directory
:param protein_sequences: protein sequences (optional), by default CDS files
are translated using the standard genetic code.
:param tmp_dir: tmp directory name (optional)
:param aligner: aligner to use
:param codeml: path to codeml executable
:param times: number of times to iteratively perform ML estimation of Ks,
Ka and omega values.
:param min_msa_length: minimum multiple sequence alignment length
:param ignore_prefixes: ignore prefixes defined by '|' in gene IDs
:param one_v_one: boolean, one-vs.-one ortholog analysis
:param pairwise: run in pairwise mode
:param preserve: boolean, preserve codeml output files, multiple sequence
alignments and trees?
:param async: use the async library for parallelization (not recommended)
:param n_threads: number of threads to use
:param weighting_method: weighting method (fasttree, phyml or alc)
:param max_pairwise: maximum number of pairwise combinations a gene family
may have. This effectively filters out families of size n where
n*(n-1)/2 exceeds `max_pairwise`.
:return: output file name
"""
# lazy imports
from wgd.ks_distribution import ks_analysis_paranome, ks_analysis_one_vs_one
from wgd.viz import plot_selection
from wgd.utils import uniq_id
# software check
software = [codeml, aligner]
if weighting_method == 'fasttree':
software.append('FastTree')
elif weighting_method == 'phyml':
software.append('phyml')
if can_i_run_software(software) == 1:
logging.error(
'Could not run all required external software, exit here.')
return 1
# input check
if not (gene_families and sequences):
logging.error('No gene families or no sequences provided.')
return 1
if not tmp_dir:
# unique tmp directory
tmp_dir = os.path.join('.', 'ks_tmp.' + uniq_id())
# get absolute paths before changing dir
output_directory = os.path.abspath(output_directory)
tmp_dir = os.path.abspath(tmp_dir)
gene_families = os.path.abspath(gene_families)
# some warnings
if os.path.exists(output_directory):
logging.warning('Output directory exists, will possibly overwrite')
else:
os.mkdir(output_directory)
if os.path.exists(tmp_dir):
logging.info('tmp directory exists, will try to resume analysis.')
else:
os.mkdir(tmp_dir)
logging.debug('Reading CDS sequences')
seq_list = [os.path.abspath(x) for x in sequences]
cds_seqs = {}
for seq_file in seq_list:
cds_seqs.update(read_fasta(seq_file))
# translate CDS file(s)
if not protein_sequences:
logging.info('Translating CDS file')
protein_seqs = translate_cds(cds_seqs)
else:
logging.debug('Reading protein sequences')
seq_list = [os.path.abspath(x) for x in
protein_sequences.strip().split(',')]
protein_seqs = {}
for seq_file in seq_list:
protein_seqs.update(read_fasta(seq_file))
# define a base name for the output files
base = '_'.join([os.path.basename(x) for x in seq_list])
# one-vs-one ortholog input
if one_v_one:
os.chdir(tmp_dir) # chdir is necessary because codeml produces these
# rub, rst and rst1 files
logging.info('Started one-vs-one ortholog Ks analysis')
results = ks_analysis_one_vs_one(
cds_seqs, protein_seqs, gene_families,
tmp_dir, output_directory, codeml,
n_threads=n_threads, preserve=preserve,
times=times,
aligner=aligner,
**kwargs
)
results.round(5).to_csv(os.path.join(
output_directory, '{}.ks.tsv'.format(base)), sep='\t')
logging.info('Generating plots')
plot_selection(
results, title=os.path.basename(gene_families),
output_file=os.path.join(output_directory, '{}.ks.svg'.format(base))
)
logging.info('Done')
return os.path.join(output_directory, '{}.ks.tsv'.format(base))
# whole paranome ks analysis
else:
os.chdir(tmp_dir) # change directory to the tmp dir, as codeml writes
# non-unique file names to the working dir
logging.info('Started whole paranome Ks analysis')
results = ks_analysis_paranome(
cds_seqs, protein_seqs, gene_families,
tmp_dir, output_directory,
codeml, preserve=preserve,
times=times, aligner=aligner,
ignore_prefixes=ignore_prefixes,
n_threads=n_threads,
min_length=min_msa_length,
method=weighting_method,
pairwise=pairwise,
max_pairwise=max_pairwise,
**kwargs
)
results.round(5).to_csv(os.path.join(
output_directory, '{}.ks.tsv'.format(base)), sep='\t')
logging.info('Generating plots')
plot_selection(
results, title=os.path.basename(gene_families),
output_file=os.path.join(output_directory, '{}.ks.svg'.format(base))
)
logging.info('Done')
return os.path.join(output_directory, '{}.ks.tsv'.format(base))
# CO-LINEARITY -----------------------------------------------------------------
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.argument('gff_file', default=None, type=click.Path(exists=True))
@click.argument('gene_families', default=None, type=click.Path(exists=True))
@click.option(
'--ks_distribution', '-ks', default=None,
help="paranome ks distribution tsv file (optional, see `wgd ks`)"
)
@click.option(
'--output_dir', '-o', default='./wgd_syn', show_default=True,
help='output directory'
)
@click.option(
'--feature', '-f', default='gene', show_default=True,
help="keyword for parsing the genes from the GFF file (column 3)"
)
@click.option(
'--gene_attribute', '-a', default='ID', show_default=True,
help="keyword for parsing the gene IDs from the GFF file (column 9)"
)
@click.option(
'--min_length', '-l', default=250, show_default=True,
help="minimum length of a genomic element (in numbers of genes) to be "
"included in dotplot."
)
@click.option(
'--ks_range', '-r', nargs=2, default=(0.05, 5), show_default=True,
type=float, help='Ks range to use for colored dotplot'
)
@click.option(
'--iadhore_options', default="",
help="other options for I-ADHoRe, as a comma separated string, "
"e.g. gap_size=30,q_value=0.75,prob_cutoff=0.05"
)
def syn(
gff_file, gene_families, ks_distribution, output_dir, feature,
gene_attribute, min_length, ks_range, iadhore_options
):
"""
Co-linearity analyses.
Requires I-ADHoRe 3.0 and a structural annotation in GFF format. Use the
``--feature`` and ``--gene_attribute`` flags to ensure correspondence
between your CDS sequences and the right features in the GFF file!
Example:
wgd syn -f gene -a id -ks panda.ks panda.gff panda.paranome.mcl
wgd Copyright (C) 2018 Arthur Zwaenepoel
This program comes with ABSOLUTELY NO WARRANTY;
"""
iadhore_opts = {x.split("=")[0].strip(): x.split("=")[1].strip()
for x in iadhore_options.split(",") if x != ""}
if len(iadhore_opts) > 0:
logging.info("I-ADHoRe 3.0 options: {}".format(iadhore_opts))
syn_(
gff_file, gene_families, output_dir, ks_distribution, feature,
gene_attribute, min_length, ks_range, **iadhore_opts
)
def syn_(
gff_file, families, output_dir, ks_distribution, feature='mRNA',
gene_attribute='Parent', min_length=250, ks_range=(0.05, 5),
**kwargs
):
"""
Co-linearity analysis with I-ADHoRe 3.0. For usage in the ``wgd`` CLI.
:param gff_file: GFF3 annotation file (see the annotation files on PLAZA as
an example)
:param families: gene families as tab separated gene IDs, see
:py:func:`blast_`
:param output_dir: output directory
:param ks_distribution: Ks distribution tsv file, see :py:func:`ks_`
:param feature: keyword for entities of interest in the GFF file, e.g.
'CDS' or 'mRNA'
:param gene_attribute: attribute key for the gene ID in the GFF (9th
column), e.g. 'ID' or 'Parent'
:return: nothing at all
"""
# lazy imports
from wgd.colinearity import write_families_file, write_gene_lists
from wgd.colinearity import write_config_adhore, run_adhore
from wgd.colinearity import get_anchor_pairs, gff_parser
from wgd.viz import plot_selection, syntenic_dotplot
from wgd.viz import syntenic_dotplot_ks_colored
# software check
if can_i_run_software(['i-adhore']) == 1:
logging.error('Could not run all software, exit here.')
return 1
# input check
if not gff_file:
logging.error('No gff file provided! Run `wgd syn --help` for usage '
'instructions.')
return 1
if not families:
logging.error('No gene families provided! Run `wgd syn --help` for '
'usage instructions.')
return 1
if os.path.exists(output_dir):
logging.warning(
'Output directory already exists, will possibly overwrite')
else:
os.mkdir(output_dir)
logging.info('Made output directory {0}'.format(output_dir))
# parse the gff
logging.info("Parsing GFF file")
try:
genome, all_genes = gff_parser(
gff_file, feature=feature, gene_attribute=gene_attribute)
except IndexError:
logging.error('Invalid GFF file, number of columns != 9')
logging.error('Aborted')
return 1
# generate necessary files for i-adhore
logging.info("Writing gene lists")
write_gene_lists(genome, os.path.join(output_dir, 'gene_lists'))
logging.info("Writing families file")
write_families_file(
families, all_genes, os.path.join(output_dir, 'families.tsv'))
logging.info("Writing configuration file")
write_config_adhore(
os.path.join(output_dir, 'gene_lists'),
os.path.join(output_dir, 'families.tsv'),
config_file_name=os.path.join(output_dir, 'adhore.conf'),
output_path=os.path.join(output_dir, 'i-adhore-out'),
**kwargs)
# run i-adhore
logging.info("Running I-ADHoRe 3.0")
run_adhore(os.path.join(output_dir, 'adhore.conf'))
# dotplot
multiplicons = pd.read_csv(os.path.join(
output_dir, 'i-adhore-out', 'multiplicons.txt'), sep='\t')
logging.info('Drawing co-linearity dotplot')
syntenic_dotplot(
multiplicons, min_length=min_length, output_file=os.path.join(
output_dir,
'{}.dotplot.svg'.format(os.path.basename(families)))
)
# Ks distribution for anchors and Ks colored dotplot
if ks_distribution:
# input files
anchor_points = pd.read_csv(os.path.join(
output_dir, 'i-adhore-out', 'anchorpoints.txt'), sep='\t')
ks_dist = pd.read_csv(ks_distribution, index_col=0, sep='\t')
# output file names
ks_out = os.path.join(output_dir, '{}.ks_anchors.tsv'.format(
os.path.basename(families)))
dotplot_out = os.path.join(output_dir, '{}.dotplot.ks.svg'.format(
os.path.basename(families)))
hist = os.path.join(output_dir, '{}.ks_anchors.svg'.format(
os.path.basename(families)))
# output and plots
logging.info("Constructing Ks distribution for anchors")
ks, anchors = get_anchor_pairs(anchor_points, ks_dist, ks_out)
logging.info("Generating Ks colored (median Ks) dotplot")
syntenic_dotplot_ks_colored(
multiplicons, anchor_points, anchors, min_ks=ks_range[0],
max_ks=ks_range[1], output_file=dotplot_out,
min_length=min_length
)
logging.info("Generating histogram")
plot_selection([ks, anchors], alphas=[0.2, 0.7], output_file=hist,
title=os.path.basename(families), weighted=False,
labels=['Whole paranome', 'Anchors'])
logging.info("Done")
# KDE FITTING ------------------------------------------------------------------
@cli.command(context_settings={'help_option_names': ['-h', '--help']})
@click.argument(
'ks_distribution', type=click.Path(exists=True), default=None
)
@click.option(
'--filters', '-f', nargs=3, type=float, default=(0., 300., 0.),
help="Data frame filters (alignment identity, length and coverage)",
show_default=True
)
@click.option(
'--ks_range', '-r', nargs=2, default=(0, 3), show_default=True,
type=float,
help='Ks range to use for modeling'
)
@click.option(
'--bandwidth', '-bw', default=None, show_default=True, type=float,
help="Bandwidth for Gaussian KDE, by default Scott's rule is used"
)