# .github/workflows/main.yml

name: CI

on:
  pull_request:
    # '**' matches every target branch, including names that contain a
    # slash (e.g. release/1.0), which a single '*' does not match.
    branches:
      - '**'
    paths:
      # The CI is triggered only when at least one of these files or
      # directories changes; a change limited to other paths (e.g. docs/)
      # does not trigger it.
      - 'tests/**'
      - 'testingDataset/**'
      - '.github/**'
      - 'ppanggolin/**'
      - 'MANIFEST.in'
      - 'VERSION'
      - 'ppanggolin_env.yaml'
      - 'pyproject.toml'
      - 'setup.py'
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Default CPU count handed to the ppanggolin commands through `--cpu`.
# The "Get core number" steps overwrite it by appending NUM_CPUS to
# $GITHUB_ENV.
env:
  NUM_CPUS: 1

# Jobs of the workflow; a workflow may hold several jobs that run
# sequentially or in parallel
jobs:
  test:
    name: test PPanGGOLiN on ${{ matrix.os }} with python ${{ matrix.python-version }}
    strategy:
      matrix:
        os:
          - 'macos-14'
        python-version:
          - '3.8'
          - '3.12'
    # Each matrix combination runs on the runner named by its `os` entry
    runs-on: ${{ matrix.os }}

    steps:

    # Get the number of CPUs available on the current runner and export it as
    # NUM_CPUS for the following steps. The conditions test runner.os rather
    # than an exact matrix label, so the detection keeps working whichever
    # Linux or macOS runner image the matrix selects.
    - name: Get core number on linux
      if: runner.os == 'Linux'
      run: |
        nb_cpu_linux=$(nproc)
        echo "Number of cores avalaible on the current linux runner $nb_cpu_linux"
        echo "NUM_CPUS=$nb_cpu_linux" >> "$GITHUB_ENV"

    - name: Get core number on macos
      if: runner.os == 'macOS'
      run: |
        nb_cpu_macos=$(sysctl -n hw.ncpu)
        echo "Number of cores avalaible on the current macos runner $nb_cpu_macos"
        echo "NUM_CPUS=$nb_cpu_macos" >> "$GITHUB_ENV"

    # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
    - uses: actions/checkout@v4
    # # Install requirements with miniconda
    # - uses: conda-incubator/setup-miniconda@v3
    #   with:
    #     python-version: ${{ matrix.python-version }}
    #     channels: conda-forge,bioconda,defaults
    #     environment-file: ppanggolin_env.yaml
    #     activate-environment: ppanggolin

    # Install the interpreter requested by the matrix. Without this step every
    # matrix entry builds its venv from the same default python3 of the runner
    # and matrix.python-version has no effect.
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

    - name: Set up Python environment
      run: |
        # Only create the virtual environment here: activating it in this
        # step would not carry over to the next steps, which each activate
        # it themselves.
        python -m venv ppgg

    - name: Install dependencies with Homebrew
      run: |
        # Only the tools needed by the tests are installed. A blanket
        # `brew upgrade` is deliberately not run: it upgrades every formula
        # already installed on the runner, which is slow and unrelated to
        # this job.
        brew update
        brew install mmseqs2 mafft
        # brew install infernal
        # brew install aragorn
        
    - name: Install ppanggolin
      run: |
        # Activate the Python environment again: the activation done in a
        # previous step does not persist in this step's shell
        source ppgg/bin/activate
        # Quote the requirement so the shell cannot glob-expand the brackets,
        # and call pip through the venv interpreter
        python -m pip install '.[python_deps]'

    # - name: Install ppanggolin
    #   shell: bash -l {0}
    #   run: |
    #     pip install .[test]
    #     mmseqs version

    # Check that it is installed and displays help without error.
    # The venv is activated again here because the activation performed in an
    # earlier step does not carry over to this step's shell.
    - name: Check that PPanGGOLiN is installed
      shell: bash -l {0}
      run: |
        source ppgg/bin/activate
        ppanggolin --version
        ppanggolin --help

    # Check that unit tests are all passing
    - name: Unit tests
      shell: bash -l {0}
      run: |
        # The custom `bash -l {0}` shell does not enable errexit by itself
        set -e
        # Run the tests inside the venv in which ppanggolin was installed
        source ppgg/bin/activate
        # NOTE(review): the python_deps extra may not provide pytest; this
        # install is a no-op when pytest is already present - confirm
        python -m pip install pytest
        python -m pytest

    - name: gbff parsing and MSA computing
      shell: bash -l {0}
      run: |
        # `bash -l {0}` is a custom shell without errexit: stop at the first
        # failing command instead of reporting only the last exit status
        set -e
        source ppgg/bin/activate
        cd testingDataset
        # -p: do not fail when the directory already exists
        mkdir -p info_to_test
        ppanggolin workflow --cpu $NUM_CPUS --anno genomes.gbff.list --output myannopang
        ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy --cpu $NUM_CPUS
        ppanggolin info --pangenome myannopang/pangenome.h5 > info_to_test/myannopang_info.yaml
        cat info_to_test/myannopang_info.yaml
        # Compare the produced gene families with the expected checksum
        echo "$(grep 'myannopang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  myannopang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
        shasum -a 256 myannopang/gene_families.tsv >> info_to_test/checksum.txt
        cd -
    - name: clusters reading from external file
      shell: bash -l {0}
      run: |
        # `bash -l {0}` is a custom shell without errexit: stop at the first
        # failing command instead of reporting only the last exit status
        set -e
        # Each step runs in a new shell, so the venv holding ppanggolin has
        # to be activated again
        source ppgg/bin/activate
        cd testingDataset
        ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang  --cpu $NUM_CPUS
        ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
        # Duplicate the first column of the cluster file
        awk 'BEGIN{FS=OFS="\t"} {$1 = $1 OFS $1} 1' clusters.tsv > clusters_with_reprez.tsv;
        ppanggolin cluster --clusters clusters_with_reprez.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
        ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
        # Compare the produced gene families with the expected checksum
        echo "$(grep 'readclusterpang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  readclusterpang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
        shasum -a 256 readclusterpang/gene_families.tsv >> info_to_test/checksum.txt
        cd -
    - name: testing context command
      shell: bash -l {0}
      run: |
        # `bash -l {0}` is a custom shell without errexit; without `set -e`
        # this step would always succeed because its last command is `cd -`
        set -e
        # Each step runs in a new shell, so the venv holding ppanggolin has
        # to be activated again
        source ppgg/bin/activate
        cd testingDataset
        ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context --fast --cpu $NUM_CPUS

        # test from gene family ids. Test here with one family of module 1. The context should find all families of module 1
        echo AP288_RS05055 > one_family_of_module_1.txt
        ppanggolin context --pangenome myannopang/pangenome.h5 --family one_family_of_module_1.txt  --output test_context_from_id --cpu $NUM_CPUS
        cd -
    - name: testing config file
      shell: bash -l {0}
      run: |
        # `bash -l {0}` is a custom shell without errexit: stop at the first
        # failing command instead of reporting only the last exit status
        set -e
        # Each step runs in a new shell, so the venv holding ppanggolin has
        # to be activated again
        source ppgg/bin/activate
        cd testingDataset
        ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
        # Keep only the first two columns of the cluster file
        cut -f1,2 clusters.tsv > clusters_without_frag.tsv
        ppanggolin panrgp  --anno genomes.gbff.list --cluster clusters_without_frag.tsv -o test_config --config panrgp_default_config.yaml --cpu $NUM_CPUS
        # Compare the produced gene families with the expected checksum
        echo "$(grep 'test_config/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  test_config/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
        shasum -a 256 test_config/gene_families.tsv >> info_to_test/checksum.txt
        cd -
    - name: testing projection cmd
      shell: bash -l {0}
      run: |
        # `bash -l {0}` is a custom shell without errexit: stop at the first
        # failing command instead of reporting only the last exit status
        set -e
        # Each step runs in a new shell, so the venv holding ppanggolin has
        # to be activated again
        source ppgg/bin/activate
        cd testingDataset
        head genomes.fasta.list | sed 's/^/input_genome_/g' > genomes.fasta.head.list
        # ppanggolin projection --pangenome myannopang/pangenome.h5  -o projection_from_list_of_fasta --fasta genomes.fasta.head.list --gff --proksee --cpu $NUM_CPUS

        # projection of a plasmid with chevron that have been added manually to test chevron handeling in GFF
        ppanggolin projection --pangenome myannopang/pangenome.h5 --anno GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron

        # projection of a GFF given together with a separate fasta file
        echo GFF_plasmid_No_seq$'\t'GBFF/plasmid_GCF_000093005.1_ASM9300v1.gff.gz >> genomes.gbff.h3_and_GFFplasmidNoSeq.list
        echo GFF_plasmid_No_seq$'\t'GBFF/plasmid_GCF_000093005.1_ASM9300v1.fna.gz >> genomes.fna.GFFplasmidNoSeq.list
        ppanggolin projection -p myannopang/pangenome.h5 --anno genomes.gbff.h3_and_GFFplasmidNoSeq.list --fasta  genomes.fna.GFFplasmidNoSeq.list

    # Upload everything the previous steps wrote to testingDataset/info_to_test
    # (the pangenome info file and the computed checksums) as an artifact named
    # after the matrix entry.
    - name: Archive diff files
      uses: actions/upload-artifact@v4
      with:
        name: comparison-results_${{ matrix.os }}_python${{ matrix.python-version }}
        path: testingDataset/info_to_test/*