-
Notifications
You must be signed in to change notification settings - Fork 31
174 lines (158 loc) · 9.37 KB
/
main.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
name: CI
on:
push:
branches:
- '*'
pull_request:
branches:
- '*'
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
test:
name: test PPanGGOLiN on ${{ matrix.os }} with python ${{ matrix.python-version }}
# The type of runner that the job will run on
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: ['ubuntu-latest', 'macos-latest']
python-version: ['3.8', '3.9', '3.10']
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2
# Setting up miniconda
- uses: conda-incubator/setup-miniconda@v2
with:
condarc-file: .condarc.yml
activate-environment: test
python-version: ${{ matrix.python-version }}
# Install the dependencies
- name: Set up test environment
shell: bash -l {0}
run: |
conda install -y --file requirements.txt
pip install .[test]
# Check that it is installed and displays help without error
- name: Check that PPanGGOLiN is installed
shell: bash -l {0}
run: |
ppanggolin --version
ppanggolin --help
# Check that unit tests are all passing
- name: Unit tests
shell: bash -l {0}
run: pytest
# Test the complete workflow
- name: Complete workflow
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin all --cpu 1 --fasta organisms.fasta.list --output mybasicpangenome
ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status
cd -
# test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good.
#--draw_hotspots option is problematic on macOS.
- name: Step by Step workflow with most options calls
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
ppanggolin graph -p stepbystep/pangenome.h5 -r 10
ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
ppanggolin draw -p stepbystep/pangenome.h5 --tile_plot --nocloud --soft_core 0.92 --ucurve --output stepbystep -f
ppanggolin rgp -p stepbystep/pangenome.h5 --persistent_penalty 2 --variable_gain 1 --min_score 3 --dup_margin 0.05
ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --borders --families_tsv --cpu 1
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta organisms.fasta.list --gff --proksee --table
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
cd -
- name: gbff parsing and MSA computing
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin workflow --cpu 1 --anno organisms.gbff.list --output myannopang
ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy
cd -
- name: clusters reading from external file
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
ppanggolin annotate --anno organisms.gbff.list --output readclusters
ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
cd -
- name: testing rgp_cluster command
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5
ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --ignore_incomplete_rgp --grr_metric max_grr -f --graph_formats graphml gexf
ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --no_identical_rgp_merging -o rgp_clustering_no_identical_rgp_merging --graph_formats graphml
cd -
- name: testing align command
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta \
--output test_align --draw_related --getinfo --fast
cd -
- name: testing context command
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context --fast
# test from gene family ids. Test here with one family of module 1. The context should find all families of module 1
echo AP288_RS05055 > one_family_of_module_1.txt
ppanggolin context --pangenome myannopang/pangenome.h5 --family one_family_of_module_1.txt --output test_context_from_id
cd -
- name: testing metadata command
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db1 -m metadata/metadata_genes.tsv -a genes
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db2 -m metadata/metadata_genomes.tsv -a genomes
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db3 -m metadata/metadata_families.tsv -a families --omit
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db4 -m metadata/metadata_rgps.tsv -a RGPs
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db5 -m metadata/metadata_contigs.tsv -a contigs
ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s db6 -m metadata/metadata_modules.tsv -a modules
ppanggolin write_pangenome -p mybasicpangenome/pangenome.h5 --output mybasicpangenome -f --gexf --light_gexf --cpu 1
ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 -o rgp_cluster_with_metadata --graph_formats graphml
cd -
- name: testing config file
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
cd -
- name: testing projection cmd
shell: bash -l {0}
run: |
cd testingDataset
head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
--organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
--spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee
- name: testing write_genome_cmds
shell: bash -l {0}
run: |
cd testingDataset
head organisms.gbff.list | cut -f1 > organisms_names.gbff.head.list
ppanggolin write_genomes -p myannopang/pangenome.h5 --output flat_genomes_from_file_org -f \
--anno organisms.gbff.list --gff --organisms organisms_names.gbff.head.list
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output flat_genomes_from_cmdline_orgs --proksee \
--organisms GCF_006508185.1_ASM650818v1_genomic,GCF_002088315.1_ASM208831v1_genomic
head organisms.fasta.list | cut -f1 > organisms_names.fasta.head.list
# Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \
--organisms organisms_names.fasta.head.list \
-f --gff --add_metadata --metadata_sep §
# Pipe separatore is found in metadata source db1. if we don't require this source then the writting with pipe is work fine.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs_with_metadata -f --gff --proksee --add_metadata --metadata_sources db2 db3 db4